diff --git a/bots/markpages.py b/bots/markpages.py index b7b45c0..0fbaded 100644 --- a/bots/markpages.py +++ b/bots/markpages.py @@ -87,6 +87,8 @@ class MarkPagesBot( CurrentPageBot ): # sets 'current_page' on each treat() for redfam in self.redfams: redfam.update_status() + RedFamWorker.flush_db_cache() + @property def redfams(self): """ @@ -168,7 +170,7 @@ class MarkPagesBot( CurrentPageBot ): # sets 'current_page' on each treat() save_ret = self.put_current( self.new_text, summary=summary ) # Status - if add_ret is None or add_ret and save_ret: + if add_ret is None or ( add_ret and save_ret ): self.current_page.redfam.article_add_status( "marked", title=self.current_page.title(withNamespace=False)) diff --git a/bots/reddiscparser.py b/bots/reddiscparser.py index 818eb05..9179841 100644 --- a/bots/reddiscparser.py +++ b/bots/reddiscparser.py @@ -33,8 +33,8 @@ from pywikibot.bot import ExistingPageBot, NoRedirectPageBot import jogobot -from lib import redpage -from lib import redfam +from lib.redpage import RedPage +from lib.redfam import RedFamParser class DiscussionParserBot( @@ -127,7 +127,7 @@ class DiscussionParserBot( else: # If successfully parsed all pages in cat, flush db write cache - redpage.RedPage.flush_db_cache() + RedPage.flush_db_cache() def treat_page( self ): """ @@ -146,20 +146,23 @@ class DiscussionParserBot( return # Initiate RedPage object - red_page = redpage.RedPage( self.current_page ) + redpage = RedPage.session.query(RedPage).filter( + RedPage.pageid == self.current_page.pageid ).one_or_none() - # Check whether parsing is needed - if red_page.is_parsing_needed(): + if redpage: + redpage.update( self.current_page ) + else: + redpage = RedPage( self.current_page ) + # Check whether parsing is needed + if redpage.is_parsing_needed(): # Count families for failure analysis fam_counter = 0 # Iterate over returned generator with redfam sections - for fam in red_page.parse(): - + for fam in redpage.parse(): # Run RedFamParser on section text - redfam.RedFamParser.parser( fam, red_page.page, - red_page.is_archive() ) + RedFamParser.parser( fam, redpage, redpage.archive ) fam_counter += 1 @@ -167,12 +170,13 @@ class DiscussionParserBot( # If successfully parsed whole page, flush # db write cache if( fam_counter ): - redfam.RedFamParser.flush_db_cache() + + RedFamParser.flush_db_cache() jogobot.output( "Page [[{reddisc}]] parsed".format( - reddisc=red_page.page.title() ) ) + reddisc=redpage.page.title() ) ) else: jogobot.output( "\03{red}" + "Page [[{reddisc}]], ".format( - reddisc=red_page.page.title() ) + + reddisc=redpage.page.title() ) + "containing no redfam, parsed!", "WARNING" ) diff --git a/lib/mysqlred.py b/lib/mysqlred.py index 9e2e01b..1760fda 100644 --- a/lib/mysqlred.py +++ b/lib/mysqlred.py @@ -25,350 +25,300 @@ Provides interface classes for communication of redundances bot with mysql-db """ -# Prefere using oursql then MySQLdb -try: - import oursql as mysqldb -except ImportError: - import MySQLdb as mysqldb +import atexit # noqa -import atexit - -import pywikibot +import pywikibot # noqa from pywikibot import config import jogobot +from sqlalchemy import ( + create_engine, Column, Integer, String, Text, DateTime, ForeignKey ) +from sqlalchemy import text # noqa +from sqlalchemy.engine.url import URL +from sqlalchemy.ext.declarative import ( + declarative_base, declared_attr, has_inherited_table ) +from sqlalchemy.ext.mutable import MutableComposite, MutableSet +from sqlalchemy.orm import sessionmaker, relationship, composite +from sqlalchemy.orm.collections import attribute_mapped_collection +import sqlalchemy.types as types -class MysqlRed: - """ - Basic interface class, containing opening of connection - Specific querys should be defined in descendant classes per data type - """ +Base = declarative_base() - # Save mysqldb-connection as class attribute to use only one - # in descendant classes - connection = False - db_hostname = config.db_hostname - db_port = config.db_port - db_username = config.db_username - db_password = config.db_password - db_name = config.db_username + jogobot.config['db_suffix'] - db_table_prefix = False - - # Class variables for storing cached querys - _cached_update_data = [] - _update_query = '' - _cached_insert_data = {} - _insert_query = '' - - def __init__( self ): - """ - Opens a connection to MySQL-DB +url = URL( "mysql+oursql", + username=config.db_username, + password=config.db_password, + host=config.db_hostname, + port=config.db_port, + database=config.db_username + jogobot.config['db_suffix'] ) +engine = create_engine(url, echo=True) - @returns mysql-stream MySQL Connection - """ - - # Needs to be generated after Parsing of Args (not at import time) - if not type(self).db_table_prefix: - type(self).db_table_prefix = \ - pywikibot.Site().family.dbName(pywikibot.Site().code) - # Now we can setup prepared queries - self._prepare_queries() +Session = sessionmaker(bind=engine) +session = Session() - # Connect to mysqldb only once - if not type( self ).connection: +family = pywikibot.Site().family.dbName(pywikibot.Site().code) - type( self ).connection = mysqldb.connect( - host=type( self ).db_hostname, - port=type( self ).db_port, - user=type( self ).db_username, - passwd=type( self ).db_password, - db=type( self ).db_name ) - # Register callback for warnig if exit with cached db write querys - atexit.register( type(self).warn_if_not_flushed ) +class Mysql(object): + session = session - def __del__( self ): - """ - Before deleting class, close connection to MySQL-DB - """ + @declared_attr + def _tableprefix(cls): + return family + "_" - type( self ).connection.close() + @declared_attr + def _tablesuffix(cls): + return "s" - def _prepare_queries( self ): - """ - Used to replace placeholders in prepared queries - """ - type(self)._update_query = type(self)._update_query.format( - prefix=type(self).db_table_prefix) - type(self)._insert_query = type(self)._insert_query.format( - prefix=type(self).db_table_prefix) + @declared_attr + def __tablename__(cls): + if has_inherited_table(cls): + return None + name = cls.__name__[len("Mysql"):].lower() + return cls._tableprefix + name + cls._tablesuffix - @classmethod - def flush( cls ): - """ - Run cached querys - """ - if not cls.connection: - raise MysqlRedConnectionError( "No connection exists!" ) - - cursor = cls.connection.cursor() - - # Execute insert query - if cls._cached_insert_data: - # Since cls._cached_insert_data is a dict, we need to have a custom - # Generator to iterate over it - cursor.executemany( cls._insert_query, - ( cls._cached_insert_data[ key ] - for key in cls._cached_insert_data ) ) - # Reset after writing - cls._cached_insert_data = {} - - # Execute update query - # Use executemany since update could not be reduced to one query - if cls._cached_update_data: - cursor.executemany( cls._update_query, cls._cached_update_data ) - # Reset after writing - cls._cached_update_data = [] - - # Commit db changes - if cls._cached_insert_data or cls._cached_update_data: - cls.connection.commit() - - @classmethod - def warn_if_not_flushed(cls): - """ - Outputs a warning if there are db write querys cached and not flushed - before exiting programm! - """ - if cls._cached_update_data or cls._cached_insert_data: - jogobot.output( "Cached Database write querys not flushed!!! " + - "Data loss is possible!", "WARNING" ) + def changedp(self): + return self.session.is_modified(self) -class MysqlRedPage( MysqlRed ): +class MutableSet(MutableSet): """ - MySQL-db Interface for handling querys for RedPages + Extended version of the mutable set for our states """ - # Class variables for storing cached querys - # '{prefix}' will be replaced during super().__init__() - _cached_update_data = [] - _update_query = 'UPDATE `{prefix}_redpages` \ -SET `pagetitle` = ?, `revid` = ?, `status`= ? WHERE `pageid` = ?;' - - _cached_insert_data = {} - _insert_query = 'INSERT INTO `{prefix}_redpages` \ -( pageid, pagetitle, revid, status ) VALUES ( ?, ?, ?, ? );' - - def __init__( self, pageid ): + def has(self, item): """ - Creates a new instance, runs __init__ of parent class - """ - - super().__init__( ) + Check if item is in set - self.__pageid = int( pageid ) - - self.data = self.get_page() - - def __del__( self ): - """ - Needed to prevent descendant classes of MYSQL_RED from deleting - connection to db + @param item Item to check """ - pass + return item in self - def get_page( self ): + def add(self, item): """ - Retrieves a red page row from MySQL-Database for given page_id + Extended add method, which only result in changed object if there is + really an item added. - @param int pageid MediaWiki page_id for page to retrieve - - @returns tuple Tuple with data for given page_id - bool FALSE if none found + @param item Item to add """ + if item not in self: + super().add(item) - cursor = type( self ).connection.cursor(mysqldb.DictCursor) - - cursor.execute( - 'SELECT * FROM `{prefix}_redpages` WHERE `pageid` = ?;'.format( - prefix=type(self).db_table_prefix), ( self.__pageid, ) ) - - res = cursor.fetchone() - - if res: - return res - else: - return False - - def add_page( self, pagetitle, revid, status=0 ): + def discard(self, item): """ - Inserts a red page row in MySQL-Database for given pageid + Wrapper for extended remove below - @param int revid MediaWiki current revid - @param str pagetitle MediaWiki new pagetitle - @param int status Page parsing status + @param item Item to discard """ + self.remove(item) - insert_data = { self.__pageid: ( self.__pageid, pagetitle, - revid, status ) } - - type( self )._cached_insert_data.update( insert_data ) - - # Manualy construct self.data dict - self.data = { 'pageid': self.__pageid, 'revid': revid, - 'pagetitle': pagetitle, 'status': status } - - def update_page( self, revid=None, pagetitle=None, status=0 ): + def remove(self, item, weak=True ): """ - Updates the red page row in MySQL-Database for given page_id + Extended remove method, which only results in changed object if there + is really an item removed. Additionally, combine remove and discard! - @param int revid MediaWiki current rev_id - @param str pagetitle MediaWiki new page_title - @param int status Page parsing status + @param item Item to remove/discard + @param weak Set to false to use remove, else discard behavior """ + if item in self: + if weak: + super().discard(item) + else: + super().remove(item) - if not pagetitle: - pagetitle = self.data[ 'pagetitle' ] - if not revid: - revid = self.data[ 'revid' ] - - type( self )._cached_update_data.append( ( pagetitle, revid, - status, self.__pageid ) ) - -class MysqlRedFam( MysqlRed ): +class ColumnList( list, MutableComposite ): """ - MySQL-db Interface for handling querys for RedFams + Combines multiple Colums into a list like object """ - # Class variables for storing cached querys - _cached_update_data = [] - _update_query = 'UPDATE `{prefix}_redfams` \ -SET `redpageid` = ?, `heading` = ?, `beginning` = ?, `ending` = ?, \ -`status`= ? WHERE `famhash` = ?;' - - _cached_insert_data = {} - _insert_query = 'INSERT INTO `{prefix}_redfams` \ -( famhash, redpageid, beginning, ending, status, heading, \ -article0, article1, article2, article3, article4, article5, article6, \ -article7 ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );' - - def __init__( self, famhash=None ): + def __init__( self, *columns ): """ - Creates a new instance, runs __init__ of parent class + Wrapper to the list constructor deciding whether we have initialization + with individual params per article or with an iterable. """ + # Individual params per article (from db), first one is a str + if isinstance( columns[0], str ) or \ + isinstance( columns[0], MutableSet ) or columns[0] is None: + super().__init__( columns ) + # Iterable articles list + else: + super().__init__( columns[0] ) - self.__famhash = famhash - - super().__init__( ) - - def __del__( self ): - """ - Needed to prevent descendant classes of MYSQL_RED from deleting - connection to db - """ - pass - - def get_fam( self, famhash ): + def __setitem__(self, key, value): """ - Retrieves a red family row from MySQL-Database for given fam_hash - - @returns dict Dictionairy with data for given fam hash - False if none found + The MutableComposite class needs to be noticed about changes in our + component. So we tweak the setitem process. """ - self.__famhash = famhash - cursor = type( self ).connection.cursor( mysqldb.DictCursor ) + # set the item + super().__setitem__( key, value) - cursor.execute( - 'SELECT * FROM `{prefix}_redfams` WHERE `famhash` = ?;'. - format( prefix=type(self).db_table_prefix), ( famhash, ) ) + # alert all parents to the change + self.changed() - self.data = cursor.fetchone() - - def add_fam( self, articlesList, heading, redpageid, - beginning, ending=None, status=0 ): - - data = [ self.__famhash, redpageid, beginning, ending, - status, heading ] - - for article in articlesList: - data.append( str( article ) ) + def __composite_values__(self): + """ + The Composite method needs to have this method to get the items for db. + """ + return self - while len( data ) < 14: - data.append( None ) - data = tuple( data ) +class Status( types.TypeDecorator ): - insert_data = { self.__famhash: data } - type( self )._cached_insert_data.update( insert_data ) + impl = types.String - # Manualy construct self.data dict - data_keys = ( 'famhash', 'redpageid', 'beginning', 'ending', - 'status', 'heading', 'article0', 'article1', 'article2', - 'article3', 'article4', 'article5', 'article6', - 'article7' ) - self.data = dict( zip( data_keys, data ) ) + def process_bind_param(self, value, dialect): + """ + Returns status as commaseparated string (to save in DB) - def update_fam( self, redpageid, heading, beginning, ending, status ): + @returns Raw status string + @rtype str """ - Updates the red fam row in MySQL-Database for given fam_hash + if isinstance(value, MutableSet): + return ",".join( value ) + elif isinstance(value, String ) or value is None: + return value + else: + raise TypeError( + "Value should be an instance of one of {0:s},".format( + str( [type(MutableSet()), type(String()), type(None)] ) ) + + "given value was an instance of {1:s}".format( + str(type(value))) ) - @param int redpageid MediaWiki page_id - @param datetime beginning Timestamp of beginning - qparam datetime ending Timestamp of ending of - @param int status red_fam status + def process_result_value(self, value, dialect): """ + Sets status based on comma separated list - type( self )._cached_update_data.append( ( redpageid, heading, - beginning, ending, status, - self.__famhash ) ) + @param raw_status Commaseparated string of stati (from DB) + @type raw_status str + """ + if value: + return MutableSet( value.strip().split(",")) + else: + return MutableSet([]) + + def copy(self, **kw): + return Status(self.impl.length) + + +class MysqlRedFam( Mysql, Base ): + + famhash = Column( String(64), primary_key=True, unique=True ) + + __article0 = Column('article0', String(255), nullable=False ) + __article1 = Column('article1', String(255), nullable=False ) + __article2 = Column('article2', String(255), nullable=True ) + __article3 = Column('article3', String(255), nullable=True ) + __article4 = Column('article4', String(255), nullable=True ) + __article5 = Column('article5', String(255), nullable=True ) + __article6 = Column('article6', String(255), nullable=True ) + __article7 = Column('article7', String(255), nullable=True ) + __articlesList = composite( + ColumnList, __article0, __article1, __article2, __article3, + __article4, __article5, __article6, __article7 ) + + heading = Column( Text, nullable=False ) + redpageid = Column( + Integer, ForeignKey( family + "_redpages.pageid" ), nullable=False ) + beginning = Column( DateTime, nullable=False ) + ending = Column( DateTime, nullable=True ) + _status = Column( 'status', MutableSet.as_mutable(Status(255)), + nullable=True ) + + __article0_status = Column( + 'article0_status', MutableSet.as_mutable(Status(64)), nullable=True ) + __article1_status = Column( + 'article1_status', MutableSet.as_mutable(Status(64)), nullable=True ) + __article2_status = Column( + 'article2_status', MutableSet.as_mutable(Status(64)), nullable=True ) + __article3_status = Column( + 'article3_status', MutableSet.as_mutable(Status(64)), nullable=True ) + __article4_status = Column( + 'article4_status', MutableSet.as_mutable(Status(64)), nullable=True ) + __article5_status = Column( + 'article5_status', MutableSet.as_mutable(Status(64)), nullable=True ) + __article6_status = Column( + 'article6_status', MutableSet.as_mutable(Status(64)), nullable=True ) + __article7_status = Column( + 'article7_status', MutableSet.as_mutable(Status(64)), nullable=True ) + __articlesStatus = composite( + ColumnList, __article0_status, __article1_status, __article2_status, + __article3_status, __article4_status, __article5_status, + __article6_status, __article7_status ) + + redpage = relationship( "MysqlRedPage", enable_typechecks=False, + back_populates="redfams" ) + + @property + def articlesList(self): + """ + List of articles belonging to the redfam + """ + return self.__articlesList + + @articlesList.setter + def articlesList(self, articlesList): + # Make sure to always have full length for complete overwrites + while( len(articlesList) < 8 ): + articlesList.append(None) + self.__articlesList = ColumnList(articlesList) + + @property + def status( self ): + """ + Current fam status + """ + return self._status + + @status.setter + def status( self, status ): + if status: + self._status = MutableSet( status ) + else: + self._status = MutableSet() - def get_by_status( self, status ): + @property + def articlesStatus(self): """ - Generator witch fetches redFams with given status from DB + List of status strings/sets for the articles of the redfam """ + return self.__articlesStatus - cursor = type( self ).connection.cursor( mysqldb.DictCursor ) + @articlesStatus.setter + def articlesStatus(self, articlesStatus): + self.__articlesStatus = ColumnList(articlesStatus) - cursor.execute( - 'SELECT * FROM `{prefix}_redfams` WHERE `status` = LIKE %?%;'. - format( prefix=type( self ).db_table_prefix), ( status, ) ) - while True: - res = cursor.fetchmany( 1000 ) - if not res: - break - for row in res: - yield row +class MysqlRedPage( Mysql, Base ): + pageid = Column( Integer, unique=True, primary_key=True ) + revid = Column( Integer, unique=True, nullable=False ) + pagetitle = Column( String(255), nullable=False ) + __status = Column( 'status', MutableSet.as_mutable(Status(255)), + nullable=True ) - def get_by_status_and_ending( self, status, ending ): + redfams = relationship( + "MysqlRedFam", enable_typechecks=False, + back_populates="redpage", order_by=MysqlRedFam.famhash, + collection_class=attribute_mapped_collection("famhash") ) + + @property + def status( self ): """ - Generator witch fetches redFams with given status from DB + Current fam status """ + return self.__status + + @status.setter + def status( self, status ): + if status: + self.__status = MutableSet( status ) + else: + self.__status = MutableSet() - cursor = type( self ).connection.cursor( mysqldb.DictCursor ) - - cursor.execute( ( - 'SELECT * ' + - 'FROM `{prefix}_redfams` `F` ' + - 'INNER JOIN `{prefix}_redpages` `P` ' + - 'ON `F`.`status` = ? ' + - 'AND `F`.`ending` >= ? ' + - 'AND `F`.`redpageid` = `P`.`pageid`;').format( - prefix=type( self ).db_table_prefix), - ( status, ending ) ) - - while True: - res = cursor.fetchmany( 1000 ) - if not res: - break - for row in res: - yield row +Base.metadata.create_all(engine) class MysqlRedError(Exception): diff --git a/lib/redfam.py b/lib/redfam.py index 6e8b3d5..ca10e87 100644 --- a/lib/redfam.py +++ b/lib/redfam.py @@ -3,7 +3,7 @@ # # redfam.py # -# Copyright 2015 GOLDERWEB – Jonathan Golder +# Copyright 2017 GOLDERWEB – Jonathan Golder # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -35,10 +35,10 @@ import pywikibot # noqa from pywikibot.tools import deprecated # noqa import jogobot -from lib.mysqlred import MysqlRedFam +from lib.mysqlred import MysqlRedFam, text -class RedFam: +class RedFam( MysqlRedFam ): """ Basic class for RedFams, containing the basic data structure """ @@ -60,22 +60,16 @@ class RedFam: # Having pywikibot.Site() is a good idea most of the time self.site = pywikibot.Site() - # Database interface - self._mysql = MysqlRedFam( famhash ) - - # Initial attribute values - self._articlesList = articlesList - self._beginning = beginning - self._ending = ending - self._redpageid = redpageid - self._status = set() - self._status = self._parse_status(status) - self._famhash = famhash - self._heading = heading - - # Calculates the sha1 hash over self._articlesList to - # rediscover known redundance families - self.calc_famhash() + super().__init__( + articlesList=articlesList, + beginning=beginning, + ending=ending, + redpageid=redpageid, + famhash=famhash, + heading=heading, + status=status, + articlesStatus=None + ) def __repr__( self ): """ @@ -85,18 +79,20 @@ class RedFam: """ __repr = "RedFam( " + \ - "articlesList=" + repr( self._articlesList ) + \ - ", heading=" + repr( self._heading ) + \ - ", beginning=" + repr( self._beginning ) + \ - ", ending=" + repr( self._ending ) + \ - ", red_page_id=" + repr( self._redpageid ) + \ - ", status=" + repr( self._status ) + \ - ", fam_hash=" + repr( self._famhash ) + \ + "articlesList=" + repr( self.articlesList ) + \ + ", heading=" + repr( self.heading ) + \ + ", beginning=" + repr( self.beginning ) + \ + ", ending=" + repr( self.ending ) + \ + ", red_page_id=" + repr( self.redpageid ) + \ + ", status=" + repr( self.status ) + \ + ", fam_hash=" + repr( self.famhash ) + \ + ", articlesStatus=" + repr( self.articlesStatus ) + \ " )" return __repr - def calc_famhash( self ): + @classmethod + def calc_famhash(cls, articlesList ): """ Calculates the SHA-1 hash for the articlesList of redundance family. Since we don't need security SHA-1 is just fine. @@ -105,99 +101,21 @@ class RedFam: """ h = hashlib.sha1() - h.update( str( self._articlesList[:8] ).encode('utf-8') ) + # Since articlesList attr of RedFam will have always 8 Members we + # need to fill up smaller lists (longers will be cropped below). + while len( articlesList) < 8: + articlesList.append(None) - if self._famhash and h.hexdigest() != self._famhash: - raise RedFamHashError( self._famhash, h.hexdigest() ) + h.update( str( articlesList[:8] ).encode('utf-8') ) - elif self._famhash: - return - else: - self._famhash = h.hexdigest() - - def changed( self ): - """ - Checks wether anything has changed and maybe triggers db update - """ - - # On archived redfams do not delete possibly existing ending - if( not self._ending and "archived" in self._status and - self._mysql.data[ 'ending' ] ): - - self._ending = self._mysql.data[ 'ending' ] - - # Since status change means something has changed, update database - if( self._raw_status != self._mysql.data[ 'status' ] or - self._beginning != self._mysql.data[ 'beginning' ] or - self._ending != self._mysql.data[ 'ending' ] or - self._red_page_id != self._mysql.data[ 'redpageid' ] or - self._heading != self._mysql.data[ 'heading' ]): - - self._mysql.update_fam( self._redpageid, self._heading, - self._beginning, self._ending, - self._raw_status() ) + return h.hexdigest() @classmethod def flush_db_cache( cls ): """ Calls flush method of Mysql Interface class """ - MysqlRedFam.flush() - - def add_status(self, status): - """ - Adds a status specified by status, to status set - - @param status Statusstring to add - @type status str - """ - self._status.add(status) - - def remove_status(self, status, weak=True): - """ - Removes a status, specified by status from set. If weak is set to - False it will throw a KeyError when trying to remove a status not set. - - @param status Statusstring to add - @type status str - @param weak Change behavior on missing status - @type bool - """ - if weak: - self._status.discard(status) - else: - self._status.remove(status) - - def has_status(self, status): - """ - Returns True, if redfam has given status - - @param status Statusstring to check - @type status str - @returns True if status is present else False - """ - if status in self._status: - return True - else: - return False - - def _parse_status(self, raw_status ): - """ - Sets status based on comma separated list - - @param raw_status Commaseparated string of stati (from DB) - @type raw_status str - """ - self._status = set( raw_status.strip().split(",")) - - def _raw_status( self ): - """ - Returns status as commaseparated string (to save in DB) - - @returns Raw status string - @rtype str - """ - return ",".join( self._status ) + cls.session.commit() def article_add_status(self, status, index=None, title=None ): """ @@ -212,10 +130,10 @@ class RedFam: @type title str """ if title and not index: - index = self._articlesList.index( title ) + index = self.articlesList.index( title ) - if isinstance( index, int ) and index < len(self._articlesList): - self._article_status[index].add(status) + if isinstance( index, int ) and index < len(self.articlesList): + self.articlesStatus[index].add(status) else: raise IndexError( "No index given or wrong format!") @@ -236,13 +154,13 @@ class RedFam: @type bool """ if title and not index: - index = self._articlesList.index( title ) + index = self.articlesList.index( title ) - if isinstance( index, int ) and index < len(self._articlesList): + if isinstance( index, int ) and index < len(self.articlesList): if weak: - self._article_status[index].discard(status) + self.articlesStatus[index].discard(status) else: - self._article_status[index].remove(status) + self.articlesStatus[index].remove(status) else: raise IndexError( "No index given or wrong format!") @@ -259,56 +177,16 @@ class RedFam: @type title str """ if title and not index: - index = self._articlesList.index( title ) + index = self.articlesList.index( title ) - if isinstance( index, int ) and index < len(self._articlesList): - if status in self._article_status[index]: + if isinstance( index, int ) and index < len(self.articlesList): + if status in self.articlesStatus[index]: return True else: return False else: raise IndexError( "No index given or wrong format!") - def _article_parse_status(self, raw_status, index=None, title=None ): - """ - Sets status based on comma separated list to articles (identified by - title or index in articlesList) status set - - @param status Statusstring to set - @type status str - @param index Add to article with index in articlesList - @type index int - @param title Add to article with title in articlesList - @type title str - """ - if title and not index: - index = self._articlesList.index( title ) - - if isinstance( index, int ) and index < len(self._articlesList): - self._article_status[index] = set( raw_status.strip().split(",")) - else: - raise IndexError( "No index given or wrong format!") - - def _article_raw_status( self, index=None, title=None ): - """ - Returns status as commaseparated string (to save in DB) of article - (identified by title or index in articlesList) status set - - @param index Get from article with index in articlesList - @type index int - @param title Get from article with title in articlesList - @type title str - @returns Raw status string - @rtype str - """ - if title and not index: - index = self._articlesList.index( title ) - - if isinstance( index, int ) and index < len(self._articlesList): - return ",".join( self._article_status[index] ) - else: - raise IndexError( "No index given or wrong format!") - class RedFamParser( RedFam ): """ @@ -331,7 +209,7 @@ class RedFamParser( RedFam ): wurde gewünscht von:" __done_notice2 = "{{Erledigt|" - def __init__( self, heading, redpage, redpagearchive, + def __init__( self, articlesList, heading, redpage, redpagearchive, beginning, ending=None ): """ Creates a RedFam object based on data collected while parsing red_pages @@ -346,57 +224,50 @@ class RedFamParser( RedFam ): str strptime parseable string """ - # Set object attributes: - self._redpageid = redpage._pageid - self._redpagearchive = redpagearchive - self._famhash = None - - # Method self.add_beginning sets self._beginning directly - self.add_beginning( beginning ) + # Calculates the sha1 hash over self._articlesList to + # rediscover known redundance families + famhash = type(self).calc_famhash(articlesList) - # Method self.add_ending sets self._ending directly - if( ending ): - self.add_ending( ending ) - else: - # If no ending was provided set to None - self._ending = None + # Set object attributes: + self.redpage = redpage - self._status = set() + # Parse Timestamps + beginning = self.__datetime(beginning) + if ending: + ending = self.__datetime(ending) - # Parse the provided heading of redundance section - # to set self._articlesList - self.heading_parser( heading ) + super().__init__( articlesList, + beginning, + ending=ending, + redpageid=redpage.page._pageid, + famhash=famhash, + heading=heading ) - # Calculates the sha1 hash over self._articlesList to - # rediscover known redundance families + # Check status changes + self.check_status() - self.calc_famhash() + self.session.add(self) - # Open database connection, ask for data if existing, - # otherwise create entry - self.__handle_db() + def update( self, articlesList, heading, redpage, redpagearchive, + beginning, ending=None ): - # Check status changes - self.status() + self.articlesList = articlesList + self.heading = heading + self.redpage = redpage + self.redpageid = redpage.pageid - # Triggers db update if anything changed - self.changed() + self.add_beginning( beginning ) - def __handle_db( self ): - """ - Handles opening of db connection - """ + if ending: + self.add_ending( ending ) - # We need a connection to our mysqldb - self._mysql = MysqlRedFam( ) - self._mysql.get_fam( self._famhash ) + self._redpagearchive = redpagearchive - if not self._mysql.data: - self._mysql.add_fam( self._articlesList, self._heading, - self._redpageid, self._beginning, - self._ending ) + # Check status changes + self.check_status() - def heading_parser( self, heading ): + @classmethod + def heading_parser( cls, heading ): """ Parses given red_fam_heading string and saves articles list @@ -404,34 +275,13 @@ class RedFamParser( RedFam ): @type heading wikicode or mwparser-parseable """ - # Save heading as string - self._heading = str( heading ) - # Parse string heading with mwparse again everytime # In some cases the given wikicode is broken due to syntax errors # (Task FS#77) - heading = mwparser.parse( self._heading ) + heading = mwparser.parse( str( heading ) ) # Save destinations of wikilinks in headings - self._articlesList = [ str( link.title ) for link - in heading.ifilter_wikilinks() ] - - # Catch sections with more then 8 articles, print error - if len( self._articlesList ) > 8: - # For repression in output we need to know the fam hash - self.calc_famhash() - - jogobot.output( - ( "\03{{lightred}}" + - "Maximum number of articles in red_fam exceeded, " + - "maximum number is 8, {number:d} were given \n {repress}" - ).format( datetime=datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" ), number=len( self._articlesList ), - repress=repr( self ) ), - "WARNING" ) - - # Only save the first 8 articles - self._articlesList = self._articlesList[:8] + return [ str( link.title ) for link in heading.ifilter_wikilinks() ] def add_beginning( self, beginning ): """ @@ -440,7 +290,7 @@ class RedFamParser( RedFam ): @param datetime datetime Beginning date """ - self._beginning = self.__datetime( beginning ) + self.beginning = self.__datetime( beginning ) def add_ending( self, ending ): """ @@ -449,7 +299,7 @@ class RedFamParser( RedFam ): @param datetime datetime Ending date """ - self._ending = self.__datetime( ending ) + self.ending = self.__datetime( ending ) def __datetime( self, timestamp ): """ @@ -473,7 +323,7 @@ class RedFamParser( RedFam ): type( self ).__timestamp_format ) return result - def status( self ): + def check_status( self ): """ Handles detection of correct status There are three possible stati: @@ -485,16 +335,16 @@ class RedFamParser( RedFam ): # No ending, discussion is running: # Sometimes archived discussions also have no detectable ending - if not self._ending and not self._redpagearchive: - self.add_status("open") + if not self.ending and not self.redpage.archive: + self.status.add("open") else: - self.remove_status("open") - if not self._redpagearchive: - self.add_status("done") + self.status.remove("open") + if not self.redpage.archive: + self.status.add("done") else: - self.remove_status("done") - self.remove_status("open") - self.add_status("archived") + self.status.remove("done") + self.status.remove("open") + self.status.add("archived") @classmethod def is_section_redfam_cb( cls, heading ): @@ -513,7 +363,7 @@ class RedFamParser( RedFam ): return False @classmethod - def parser( cls, text, page, isarchive=False ): + def parser( cls, text, redpage, isarchive=False ): """ Handles parsing of redfam section @@ -526,7 +376,7 @@ class RedFamParser( RedFam ): text = mwparser.parse( text ) # Extract heading text - heading = next( text.ifilter_headings() ).title + heading = next( text.ifilter_headings() ).title.strip() # Extract beginnig and maybe ending (beginning, ending) = RedFamParser.extract_dates( text, isarchive ) @@ -536,16 +386,37 @@ class RedFamParser( RedFam ): if not beginning: match = re.search( jogobot.config["redundances"]["reddiscs_onlyinclude_re"], - page.title() ) + redpage.page.title() ) if match: beginning = datetime.strptime( "01. {month} {year}".format( month=match.group(1), year=match.group(2)), "%d. %B %Y" ) + articlesList = RedFamParser.heading_parser( heading ) + famhash = RedFamParser.calc_famhash( articlesList ) + + # Check for existing objects in DB first in current redpage + redfam = redpage.redfams.get(famhash) - # Create the RedFam object - RedFamParser( heading, page, isarchive, beginning, ending ) + with RedFamParser.session.no_autoflush: + if not redfam: + # Otherwise in db table + redfam = RedFamParser.session.query(RedFamParser).filter( + RedFamParser.famhash == famhash ).one_or_none() + + if redfam: + # Existing redfams need to be updated + redfam.update( articlesList, str(heading), redpage, isarchive, + beginning, ending ) + + else: + # Create the RedFam object + redfam = RedFamParser( articlesList, str(heading), + redpage, isarchive, beginning, ending ) + + # Add redfam to redpage object + redpage.redfams.set( redfam ) @classmethod def extract_dates( cls, text, isarchive=False ): @@ -599,42 +470,16 @@ class RedFamWorker( RedFam ): Handles working with redundance families stored in database where discussion is finished """ - def __init__( self, mysql_data ): - - articlesList = [] - - for key in sorted( mysql_data.keys() ): - if 'article' in key and 'status' not in key and mysql_data[ key ]: - articlesList.append( mysql_data[ key ] ) - - # Preset article status list with empty sets for existing articles - self._article_status = [set() for x in range(0, len(articlesList))] - - super().__init__( articlesList, mysql_data[ 'beginning' ], - mysql_data[ 'ending' ], mysql_data[ 'redpageid' ], - mysql_data[ 'status' ], mysql_data[ 'famhash' ], - mysql_data[ 'heading' ] ) + def __init__( self ): - self._mysql.data = mysql_data - - # Set up article status - index = 0 - for article in self._articlesList: - raw_status = mysql_data[ "article" + str(index) + "_status" ] - if not raw_status: - raw_status = str() - self._article_parse_status( raw_status, index ) - index += 1 - - # Get related RedPage-Information - self.redpageid = mysql_data[ 'pageid' ] - self.redpagetitle = mysql_data[ 'pagetitle' ] + super().__init__() # Make sure locale is set to 'de_DE.UTF-8' to prevent problems # with wrong month abreviations in strptime locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8') - def article_generator(self, filter_existing=None, filter_redirects=None, + def article_generator(self, # noqa + filter_existing=None, filter_redirects=None, exclude_article_status=[], onlyinclude_article_status=[] ): """ @@ -653,8 +498,34 @@ class RedFamWorker( RedFam ): """ # Iterate over articles in redfam - for article in self._articlesList: - page = pywikibot.Page(pywikibot.Link(article), self.site) + for article in self.articlesList: + # Not all list elements contain articles + if not article: + break + + page = pywikibot.Page(pywikibot.Link(article), pywikibot.Site()) + + # Filter existing pages if requested with filter_existing=False + if page.exists(): + self.article_remove_status( "deleted", title=article ) + if filter_existing is False: + continue + # Filter non existing Pages if requested with filter_existing=True + else: + self.article_add_status( "deleted", title=article ) + if filter_existing: + continue + + # Filter redirects if requested with filter_redirects=True + if page.isRedirectPage(): + self.article_add_status( "redirect", title=article ) + if filter_redirects: + continue + # Filter noredirects if requested with filter_redirects=False + else: + self.article_remove_status("redirect", title=article ) + if filter_redirects is False: + continue # Exclude by article status for status in exclude_article_status: @@ -666,20 +537,6 @@ class RedFamWorker( RedFam ): if not self.article_has_status( status, title=article ): continue - # Filter non existing Pages if requested with filter_existing=True - if filter_existing and not page.exists(): - continue - # Filter existing pages if requested with filter_existing=False - elif filter_existing is False and page.exists(): - continue - - # Filter redirects if requested with filter_redirects=True - if filter_redirects and page.isRedirectPage(): - continue - # Filter noredirects if requested with filter_redirects=False - elif filter_redirects is False and not page.isRedirectPage(): - continue - # Yield filtered pages yield page @@ -687,24 +544,18 @@ class RedFamWorker( RedFam ): """ Sets status to 3 when worked on """ - for article in self._articlesList: + for article in self.articlesList: + if not article: + break + if self.article_has_status( "note_rej", title=article ): - self.add_status( "note_rej" ) + self.status.add( "note_rej" ) if self.article_has_status( "sav_err", title=article ): - self.add_status( "sav_err" ) - - if not self.has_status( "sav_err" ) and \ - not self.has_status( "note_rej" ): - self.add_status( "marked" ) + self.status.add( "sav_err" ) - self._mysql.data[ 'status' ] = self._raw_status() - index = 0 - for article in self._articlesList: - self._mysql.data[ "article" + str(index) + 'status' ] = \ - self._article_raw_status( index=index ) - index += 1 - - print( repr(self) ) + if not self.status.has( "sav_err" ) and \ + not self.status.has( "note_rej" ): + self.status.add( "marked" ) def get_disc_link( self ): """ @@ -715,7 +566,7 @@ class RedFamWorker( RedFam ): """ # We need to Replace Links with their linktext - anchor_code = mwparser.parse( self._mysql.data[ 'heading' ].strip() ) + anchor_code = mwparser.parse( self.heading.strip() ) for link in anchor_code.ifilter_wikilinks(): if link.text: text = link.text @@ -728,7 +579,7 @@ class RedFamWorker( RedFam ): anchor_code.replace( " ", "_" ) # We try it with out any more parsing as mw will do while parsing page - return ( self.redpagetitle + "#" + + return ( self.redpage.pagetitle + "#" + str(anchor_code).strip() ) def generate_disc_notice_template( self ): @@ -748,7 +599,9 @@ class RedFamWorker( RedFam ): param_cnt = 3 # Iterate over articles in redfam - for article in self._articlesList: + for article in self.articlesList: + if not article: + break # Make sure to only use 8 articles (max. param 10) if param_cnt > 10: break @@ -759,11 +612,11 @@ class RedFamWorker( RedFam ): param_cnt += 1 # Add begin - begin = self._mysql.data[ 'beginning' ].strftime( "%B %Y" ) + begin = self.beginning.strftime( "%B %Y" ) template.add( "Beginn", begin, True ) # Add end (if not same as begin) - end = self._mysql.data[ 'ending' ].strftime( "%B %Y" ) + end = self.ending.strftime( "%B %Y" ) if not end == begin: template.add( "Ende", end, True ) @@ -795,13 +648,14 @@ class RedFamWorker( RedFam ): Yield red_fams stored in db by given status which have an ending after given one """ - mysql = MysqlRedFam() - for fam in mysql.get_by_status_and_ending( status, ending ): - try: - yield cls( fam ) - except RedFamHashError: - print(fam) - raise + for redfam in RedFamWorker.session.query(RedFamWorker).filter( + # NOT WORKING WITH OBJECT NOTATION + # RedFamWorker._status.like('archived'), + # RedFamWorker._status.like("%{0:s}%".format(status)), + text("status LIKE '%archived%'"), + RedFamWorker.ending >= ending ): + + yield redfam class RedFamError( Exception ): diff --git a/lib/redpage.py b/lib/redpage.py index b4361b9..69f02b8 100644 --- a/lib/redpage.py +++ b/lib/redpage.py @@ -34,7 +34,7 @@ from lib.mysqlred import MysqlRedPage from lib.redfam import RedFamParser -class RedPage: +class RedPage( MysqlRedPage ): """ Class for handling redundance discussion pages and archives """ @@ -49,73 +49,55 @@ class RedPage: @type pageid int """ - self._status = set() - # Safe the pywikibot page object - self.page = page - self.pageid = pageid - self._archive = archive + if page: + self._page = page - self.__handle_db( ) - self.is_page_changed() + super().__init__( + pageid=self._page.pageid, + revid=self._page._revid, + pagetitle=self._page.title(), + status=None + ) - self._parsed = None + self.is_archive() - def __handle_db( self ): - """ - Handles opening of db connection - """ + self.session.add(self) - # We need a connection to our mysqldb - if self.page: - self.__mysql = MysqlRedPage( self.page._pageid ) - self.pageid = self.page._pageid - elif self.pageid: - self.__mysql = MysqlRedPage( self.pageid ) - self.page = pywikibot.Page( pywikibot.Site(), - self.__mysql.data['pagetitle'] ) - self.page.exists() - else: - raise ValueError( "Page NOR pagid provided!" ) + def update( self, page ): + self._page = page + self.revid = page._revid + self.pagetitle = page.title() + self.is_archive() - if not self.__mysql.data: - self.__mysql.add_page( self.page.title(), self.page._revid ) + @property + def page(self): + if not hasattr(self, "_page"): + self._page = pywikibot.Page( pywikibot.Site(), self.pagetitle ) - def is_page_changed( self ): - """ - Check wether the page was changed since last run - """ + return self._page - if( self.__mysql.data != { 'pageid': self.page._pageid, - 'revid': self.page._revid, - 'pagetitle': self.page.title(), - 'status': self.__mysql.data[ 'status' ] } ): - self._changed = True - else: - self._changed = False + @property + def archive(self): + self.is_archive() + return self.status.has("archive") def is_archive( self ): """ Detects wether current page is an archive of discussions """ - - if( self._archive or ( u"/Archiv" in self.page.title() ) or + if( ( u"/Archiv" in self.page.title() ) or ( "{{Archiv}}" in self.page.text ) or ( "{{Archiv|" in self.page.text ) ): - - return True + self.status.add("archive") else: - return False + self.status.discard("archive") def is_parsing_needed( self ): """ Decides wether current RedPage needs to be parsed or not """ - - if( self._changed or self.__mysql.data[ 'status' ] == "" ): - return True - else: - return False + return self.changedp() or not self.status.has("parsed") def parse( self ): """ @@ -140,83 +122,12 @@ class RedPage: yield fam else: + self.status.add("parsed") self._parsed = True - self.__update_db() - - def __update_db( self ): - """ - Updates the page meta data in mysql db - """ - if( self._parsed or not self._changed ): - self.add_status( "open" ) - - if( self.is_archive() ): - self.remove_status( "open" ) - self.add_status( "archived" ) - else: - self._status = set() - - self.__mysql.update_page( self.page._revid, self.page.title(), - self._raw_status() ) @classmethod def flush_db_cache( cls ): """ Calls flush method of Mysql Interface class """ - MysqlRedPage.flush() - - def add_status(self, status): - """ - Adds a status specified by status, to status set - - @param status Statusstring to add - @type status str - """ - self._status.add(status) - - def remove_status(self, status, weak=True): - """ - Removes a status, specified by status from set. If weak is set to - False it will throw a KeyError when trying to remove a status not set. - - @param status Statusstring to add - @type status str - @param weak Change behavior on missing status - @type bool - """ - if weak: - self._status.discard(status) - else: - self._status.remove(status) - - def has_status(self, status): - """ - Returns True, if redfam has given status - - @param status Statusstring to check - @type status str - @returns True if status is present else False - """ - if status in self._status: - return True - else: - return False - - def _parse_status(self, raw_status ): - """ - Sets status based on comma separated list - - @param raw_status Commaseparated string of stati (from DB) - @type raw_status str - """ - self._status = set( raw_status.strip().split(",")) - - def _raw_status( self ): - """ - Returns status as commaseparated string (to save in DB) - - @returns Raw status string - @rtype str - """ - return ",".join( self._status ) + cls.session.commit()