#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
#  redfam.py
#
#  Copyright 2017 GOLDERWEB – Jonathan Golder
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#
"""
Provides classes for working with RedFams (redundance families)
"""

import hashlib
import locale
import re
from datetime import datetime

import mwparserfromhell as mwparser  # noqa
import pywikibot  # noqa
from pywikibot.tools import deprecated  # noqa

import jogobot
from lib.mysqlred import MysqlRedFam, MutableSet, ColumnList  # noqa


class RedFam(MysqlRedFam):
    """
    Basic class for RedFams, containing the basic data structure
    """

    def __init__(self, articlesList, beginning, ending=None, redpageid=None,
                 status=None, famhash=None, heading=None):
        """
        Generates a new RedFam object

        @param articlesList list        List of articles
        @param beginning    datetime    Beginning date
        @param ending       datetime    Ending date
        @param redpageid    int         MW pageid of containing RedPage
        @param status       MutableSet  Status of RedFam; a fresh, empty
                                        MutableSet is created when omitted
        @param famhash      str         SHA1 hash of articlesList
        @param heading      str         Original heading of RedFam (Link)
        """
        # Having pywikibot.Site() is a good idea most of the time
        self.site = pywikibot.Site()

        # A default created in the signature would be one single object
        # shared by every instance that does not pass a status, so create
        # the empty set per call instead.
        if status is None:
            status = MutableSet()

        super().__init__(
            articlesList=articlesList,
            beginning=beginning,
            ending=ending,
            redpageid=redpageid,
            famhash=famhash,
            heading=heading,
            status=status,
            articlesStatus=None)

    def __repr__(self):
        """
        Returns representation str of RedFam object

        @returns str    repr() string
        """
        return (
            "RedFam( "
            "articlesList=" + repr(self.articlesList) +
            ", heading=" + repr(self.heading) +
            ", beginning=" + repr(self.beginning) +
            ", ending=" + repr(self.ending) +
            ", red_page_id=" + repr(self.redpageid) +
            ", status=" + repr(self.status) +
            ", fam_hash=" + repr(self.famhash) +
            " )")

    @classmethod
    def calc_famhash(cls, articlesList):
        """
        Calculates the SHA-1 hash over the first 8 entries of articlesList.
        Since we don't need security SHA-1 is just fine.

        Lists with fewer than 8 entries are padded with None *in place*
        (the list passed in is modified); longer lists are cropped for
        hashing only.

        @param articlesList list    List of article titles
        @returns str                Hexadecimal hash digest
        """
        # NOTE(review): the in-place padding is kept on purpose. Callers in
        # this module hand the same list to the RedFam constructor after
        # hashing it; whether they rely on the padded length could not be
        # verified from this file.
        while len(articlesList) < 8:
            articlesList.append(None)

        h = hashlib.sha1()
        h.update(str(articlesList[:8]).encode('utf-8'))
        return h.hexdigest()

    def c_famhash(self):
        """
        Makes sure self.famhash matches the hash calculated over
        self.articlesList.

        If no famhash is set yet the calculated one is stored, otherwise
        the stored one is verified against it.

        @raises RedFamHashError     If a famhash is set but differs from
                                    the calculated one
        """
        # NOTE(review): debug output, kept to preserve existing behaviour
        print(type(self))

        calculated = type(self).calc_famhash(self.articlesList)

        if not self.famhash:
            self.famhash = calculated
        elif calculated != self.famhash:
            raise RedFamHashError(self.famhash, calculated)

    @classmethod
    def flush_db_cache(cls):
        """
        Commits the database session of the class
        """
        cls.session.commit()

    def add_status(self, status):
        """
        Adds a status specified by status, to status set

        @param status Statusstring to add
        @type status str
        """
        self.status.add(status)

    def remove_status(self, status, weak=True):
        """
        Removes a status, specified by status from set. If weak is set to
        False it will throw a KeyError when trying to remove a status not set.

        @param status Statusstring to remove
        @type status str
        @param weak Change behavior on missing status
        @type bool
        """
        if weak:
            self.status.discard(status)
        else:
            self.status.remove(status)

    def has_status(self, status):
        """
        Returns True, if redfam has given status

        @param status Statusstring to check
        @type status str
        @returns True if status is present else False
        """
        return status in self.status

    def _article_index(self, index=None, title=None):
        """
        Resolves the position of an article in the articles list

        The title is only looked up when index is unset (or 0).

        @param index Index of article in articlesList
        @type index int
        @param title Title of article in articlesList
        @type title str
        @returns Index of the article
        @rtype int
        @raises IndexError  If no usable index could be determined
        @raises ValueError  If title is not part of the articles list
        """
        if title and not index:
            index = self._articlesList.index(title)

        if isinstance(index, int) and index < len(self._articlesList):
            return index

        raise IndexError("No index given or wrong format!")

    def article_add_status(self, status, index=None, title=None):
        """
        Adds a status specified by status, to article (identified by title
        or index in articlesList) status set

        @param status Statusstring to add
        @type status str
        @param index Add to article with index in articlesList
        @type index int
        @param title Add to article with title in articlesList
        @type title str
        """
        position = self._article_index(index, title)
        self._article_status[position].add(status)

    def article_remove_status(self, status, index=None, title=None,
                              weak=True):
        """
        Removes a status specified by status, from article (identified by
        title or index in articlesList) status set
        If weak is set to False it will throw a KeyError when trying to
        remove a status not set.

        @param status Statusstring to remove
        @type status str
        @param index Remove from article with index in articlesList
        @type index int
        @param title Remove from article with title in articlesList
        @type title str
        @param weak Change behavior on missing status
        @type bool
        """
        position = self._article_index(index, title)
        if weak:
            self._article_status[position].discard(status)
        else:
            self._article_status[position].remove(status)

    def article_has_status(self, status, index=None, title=None):
        """
        Checks whether the article (identified by title or index in
        articlesList) has the given status

        @param status Statusstring to check
        @type status str
        @param index Check article with index in articlesList
        @type index int
        @param title Check article with title in articlesList
        @type title str
        @returns True if status is present else False
        """
        position = self._article_index(index, title)
        return status in self._article_status[position]

    def _article_parse_status(self, raw_status, index=None, title=None):
        """
        Sets status based on comma separated list to articles (identified by
        title or index in articlesList) status set

        @param raw_status Comma separated string of statuses
        @type raw_status str
        @param index Set for article with index in articlesList
        @type index int
        @param title Set for article with title in articlesList
        @type title str
        """
        position = self._article_index(index, title)
        self._article_status[position] = set(raw_status.strip().split(","))

    def _article_raw_status(self, index=None, title=None):
        """
        Returns status as comma separated string (to save in DB) of article
        (identified by title or index in articlesList) status set

        @param index Get from article with index in articlesList
        @type index int
        @param title Get from article with title in articlesList
        @type title str
        @returns Raw status string
        @rtype str
        """
        position = self._article_index(index, title)
        return ",".join(self._article_status[position])


class RedFamParser(RedFam):
    """
    Provides an interface to RedFam for adding/updating redundance families
    while parsing redundance pages
    """

    # Define the timestamp format
    __timestamp_format = jogobot.config['redundances']['timestamp_format']

    # Define section heading re.pattern
    __sectionhead_pat = re.compile(r"^(.*\[\[.+\]\].*\[\[.+\]\].*)")

    # Define timestamp re.pattern
    __timestamp_pat = re.compile(
        jogobot.config['redundances']['timestamp_regex'])

    # Text patterns for recognition of done-notices
    __done_notice = (":Archivierung dieses Abschnittes "
                     "wurde gewünscht von:")
    __done_notice2 = "{{Erledigt|"

    def __init__(self, articlesList, heading, redpage, redpagearchive,
                 beginning, ending=None):
        """
        Creates a RedFam object based on data collected while parsing
        red_pages and adds it to the database session

        @param articlesList     list    List of articles
        @param heading          str     Wikitext heading of section
        @param redpage          page    Pywikibot.page object
        @param redpagearchive   bool    Is red_page an archive
        @param beginning        datetime    Timestamp of beginning
                                str         as strptime parseable string
        @param ending           datetime    Timestamp of ending
                                str         strptime parseable string
        """
        # Calculates the sha1 hash over articlesList to
        # rediscover known redundance families
        famhash = type(self).calc_famhash(articlesList)

        self._redpagearchive = redpagearchive

        beginning = self.__datetime(beginning)
        if ending:
            ending = self.__datetime(ending)

        # NOTE(review): reads the private `_pageid` of the page object while
        # update() uses `pageid`; left untouched, confirm which is intended.
        super().__init__(articlesList,
                         beginning,
                         ending=ending,
                         redpageid=redpage._pageid,
                         famhash=famhash,
                         heading=heading)

        # Check status changes
        self.check_status()

        self.session.add(self)

    def update(self, articlesList, heading, redpage, redpagearchive,
               beginning, ending=None):
        """
        Updates an already known RedFam with freshly parsed data

        @param articlesList     list    List of articles
        @param heading          str     Wikitext heading of section
        @param redpage                  Object providing `pageid`; stored
                                        as self.redpage
        @param redpagearchive   bool    Is red_page an archive
        @param beginning        datetime    Timestamp of beginning
                                str         as strptime parseable string
        @param ending           datetime    Timestamp of ending
                                str         strptime parseable string
        """
        self.articlesList = articlesList
        self.heading = heading
        self.redpage = redpage
        self.redpageid = redpage.pageid

        self.add_beginning(beginning)
        # A missing ending never overwrites an already stored one
        if ending:
            self.add_ending(ending)

        self._redpagearchive = redpagearchive

        # Check status changes
        self.check_status()

    @classmethod
    def heading_parser(cls, heading):
        """
        Parses given red_fam_heading string and returns the articles list

        @param heading Heading of RedFam-Section
        @type heading wikicode or mwparser-parseable
        @returns Titles of the wikilinks found in heading
        @rtype list of str
        """
        # Parse string heading with mwparse again everytime
        # In some cases the given wikicode is broken due to syntax errors
        # (Task FS#77)
        heading = mwparser.parse(str(heading))

        # Save destinations of wikilinks in headings
        return [str(link.title) for link in heading.ifilter_wikilinks()]

    def add_beginning(self, beginning):
        """
        Adds the beginning date of a redundance discussion to the object

        @param datetime datetime    Beginning date
        """
        self.beginning = self.__datetime(beginning)

    def add_ending(self, ending):
        """
        Adds the ending date of a redundance discussion to the object.

        @param datetime datetime    Ending date
        """
        self.ending = self.__datetime(ending)

    def __datetime(self, timestamp):
        """
        Decides whether given timestamp is a parseable string or a
        datetime object and returns a datetime object in both cases

        @param datetime timestamp   Datetime object
               str      timestamp   Parseable string with timestamp

        @returns datetime           Datetime object
        """
        # Make sure locale is set to 'de_DE.UTF-8' to prevent problems
        # with wrong month abbreviations in strptime
        locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

        if isinstance(timestamp, datetime):
            return timestamp

        return datetime.strptime(timestamp,
                                 type(self).__timestamp_format)

    def check_status(self):
        """
        Handles detection of correct status
        There are three possible statuses:
        - "open"      Discussion running --> no ending, page is not an
                      archive
        - "done"      Discussion over --> ending present, page is not an
                      archive
        - "archived"  Discussion archived --> page is an archive, ending
                      (normally) present
        Other statuses already set are left untouched.
        """
        # No ending, discussion is running:
        # Sometimes archived discussions also have no detectable ending
        if not self.ending and not self._redpagearchive:
            self.add_status("open")
            return

        self.remove_status("open")
        if not self._redpagearchive:
            self.add_status("done")
        else:
            self.remove_status("done")
            self.remove_status("open")
            self.add_status("archived")

    @classmethod
    def is_section_redfam_cb(cls, heading):
        """
        Used as callback for wikicode.get_sections in redpage.parse to
        select sections which are redfams

        @param heading Heading of section
        @type heading wikicode or mwparser-parseable
        @returns True if heading contains at least two wikilinks
        """
        # Because of strange behavior in some cases, parse heading again
        # (Task FS#77)
        heading = mwparser.parse(str(heading))

        # Make sure we have min. two wikilinks in heading to assume a redfam
        return len(heading.filter_wikilinks()) >= 2

    @classmethod
    def parser(cls, text, redpage, isarchive=False):
        """
        Handles parsing of redfam section

        Looks the family up by its hash, first in redpage.redfams and then
        in the database; a known family is updated, an unknown one created.

        @param text Text of RedFam-Section
        @type text wikicode or mwparser-parseable
        @param redpage Object providing `page` and `redfams`
        @param isarchive Whether the section is part of an archive page
        @type isarchive bool
        @returns The updated or newly created redfam
        @rtype RedFamParser
        """
        # Parse heading with mwparse if needed
        if not isinstance(text, mwparser.wikicode.Wikicode):
            text = mwparser.parse(text)

        # Extract heading text
        heading = next(text.ifilter_headings()).title

        # Extract beginning and maybe ending
        (beginning, ending) = RedFamParser.extract_dates(text, isarchive)

        # Missing beginning (Task: FS#76)
        # Use first day of month of reddisc
        if not beginning:
            match = re.search(
                jogobot.config["redundances"]["reddiscs_onlyinclude_re"],
                redpage.page.title())

            if match:
                # "%B" is locale dependent; make sure the month name is
                # read with the same locale the rest of the module uses
                locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
                beginning = datetime.strptime(
                    "01. {month} {year}".format(
                        month=match.group(1), year=match.group(2)),
                    "%d. %B %Y")

        articlesList = RedFamParser.heading_parser(heading)
        famhash = RedFamParser.calc_famhash(articlesList)

        # Store the heading in the same (stripped) form no matter whether
        # the family is created or updated
        heading_text = str(heading).strip()

        # Check for existing objects in DB first in current redpage
        redfam = redpage.redfams.get(famhash)

        with RedFamParser.session.no_autoflush:
            if not redfam:
                # Otherwise in db table
                redfam = RedFamParser.session.query(RedFamParser).filter(
                    RedFamParser.famhash == famhash).one_or_none()

        if redfam:
            # Existing redfams need to be updated
            redfam.update(articlesList, heading_text, redpage, isarchive,
                          beginning, ending)
        else:
            # Create the RedFam object
            redfam = RedFamParser(articlesList, heading_text, redpage.page,
                                  isarchive, beginning, ending)

        return redfam

    @staticmethod
    def _join_timestamp(groups):
        """
        Rebuilds a timestamp string from the first four regex match groups

        @param groups Match groups of one timestamp
        @type groups sequence of str
        @returns Reconstructed timestamp
        @rtype str
        """
        return (groups[0] + ", " + groups[1] + ". " + groups[2] + ". " +
                groups[3])

    @classmethod
    def extract_dates(cls, text, isarchive=False):
        """
        Returns tuple of the first and maybe last timestamp of a section.
        Last timestamp is only returned if there is a done notice or param
        *isarchive* is set to 'True'

        @param text Text to search in
        @type text Any Type castable to str
        @param isarchive If true skip searching done notice (on archivepages)
        @type isarchive bool

        @returns Timestamps, otherwise None
        @returntype tuple of strs
        """
        # Search and done-notice detection work on the same string
        text = str(text)

        # Match all timestamps
        matches = cls.__timestamp_pat.findall(text)

        # Missing dates (Task: FS#76)
        if not matches:
            return (None, None)

        # First one is beginning
        # Since some timestamps are broken we need to reconstruct them
        # by regex match groups
        beginning = cls._join_timestamp(matches[0])

        # Last one maybe is ending:
        # done notice format 1, done notice format 2 or on archivepages
        if (cls.__done_notice in text or
                cls.__done_notice2 in text or
                isarchive):
            ending = cls._join_timestamp(matches[-1])
        else:
            ending = None

        return (beginning, ending)


class RedFamWorker(RedFam):
    """
    Handles working with redundance families stored in database
    where discussion is finished
    """

    def __init__(self, mysql_data):
        """
        Creates a RedFam object from a database row

        @param mysql_data Row data; read keys are the ones containing
                          'article' (without 'status'), 'beginning',
                          'ending', 'redpageid', 'status', 'famhash',
                          'heading', 'pageid' and 'pagetitle'
        @type mysql_data dict
        """
        # Collect the non-empty article columns in key order
        articlesList = [
            mysql_data[key] for key in sorted(mysql_data.keys())
            if 'article' in key and 'status' not in key and mysql_data[key]]

        # Preset article status list with empty sets for existing articles
        self._article_status = [set() for _ in articlesList]

        super().__init__(articlesList, mysql_data['beginning'],
                         mysql_data['ending'], mysql_data['redpageid'],
                         mysql_data['status'], mysql_data['famhash'],
                         mysql_data['heading'])

        # Get related RedPage-Information
        self.redpageid = mysql_data['pageid']
        self.redpagetitle = mysql_data['pagetitle']

        # Make sure locale is set to 'de_DE.UTF-8' to prevent problems
        # with wrong month abbreviations in strptime
        locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

    def article_generator(self, filter_existing=None, filter_redirects=None,
                          exclude_article_status=(),
                          onlyinclude_article_status=()):
        """
        Yields pywikibot pageobjects for articles belonging to this redfam

        @param filter_existing Set to True to only get existing pages
                               set to False to only get nonexisting pages
                               unset/None results in not filtering
        @type filter_existing bool/None
        @param filter_redirects Set to True to get only noredirectpages,
                                set to False to get only redirectpages,
                                unset/None results in not filtering
        @type filter_redirects bool/None
        @param exclude_article_status Skip articles having any of these
                                      statuses
        @type exclude_article_status iterable of str
        @param onlyinclude_article_status Skip articles missing any of these
                                          statuses
        @type onlyinclude_article_status iterable of str
        """
        # Iterate over articles in redfam
        for article in self._articlesList:
            page = pywikibot.Page(pywikibot.Link(article), self.site)

            # Exclude by article status
            if any(self.article_has_status(status, title=article)
                   for status in exclude_article_status):
                continue

            # Only include by article status
            if not all(self.article_has_status(status, title=article)
                       for status in onlyinclude_article_status):
                continue

            # Filter non existing Pages if requested with
            # filter_existing=True
            if filter_existing and not page.exists():
                continue
            # Filter existing pages if requested with filter_existing=False
            elif filter_existing is False and page.exists():
                continue

            # Filter redirects if requested with filter_redirects=True
            if filter_redirects and page.isRedirectPage():
                continue
            # Filter noredirects if requested with filter_redirects=False
            elif filter_redirects is False and not page.isRedirectPage():
                continue

            # Yield filtered pages
            yield page

    def update_status(self):
        """
        Derives the status of the redfam from its articles' statuses

        "note_rej" and "sav_err" are taken over from the articles; if none
        of them is present the redfam is set to "marked". Afterwards the
        raw statuses are written to self._mysql.data.
        """
        for article in self._articlesList:
            if self.article_has_status("note_rej", title=article):
                self.add_status("note_rej")
            if self.article_has_status("sav_err", title=article):
                self.add_status("sav_err")

        if not (self.has_status("sav_err") or self.has_status("note_rej")):
            self.add_status("marked")

        # NOTE(review): neither `_mysql` nor `_raw_status` is defined in
        # this module; unless MysqlRedFam provides them this raises
        # AttributeError. The key built below also has no underscore
        # before 'status'. Confirm both against lib.mysqlred.
        self._mysql.data['status'] = self._raw_status()

        for index, _article in enumerate(self._articlesList):
            self._mysql.data["article" + str(index) + 'status'] = \
                self._article_raw_status(index=index)

        # NOTE(review): debug output, kept to preserve existing behaviour
        print(repr(self))

    def get_disc_link(self):
        """
        Constructs and returns the link to Redundancy discussion

        @returns Link to discussion
        @rtype str
        """
        # We need to replace links with their linktext
        # NOTE(review): `_mysql` is not defined in this module - confirm
        anchor_code = mwparser.parse(self._mysql.data['heading'].strip())
        for link in anchor_code.ifilter_wikilinks():
            text = link.text if link.text else link.title
            anchor_code.replace(link, text)

        # Whitespace is replaced with underscores
        anchor_code.replace(" ", "_")

        # We try it without any more parsing as mw will do while parsing page
        return self.redpagetitle + "#" + str(anchor_code).strip()

    def generate_disc_notice_template(self):
        """
        Generates notice template to add on discussion Pages of Articles when
        redundancy discussion is finished

        @return Notice template to add on article disc
        @rtype wikicode-node
        """
        # Generate template boilerplate
        template = mwparser.nodes.template.Template(
            jogobot.config['redundances']['disc_notice_template_name'])

        # Articles fill the params 3 to 10, which makes 8 articles at most
        for param_cnt, article in enumerate(self._articlesList, start=3):
            if param_cnt > 10:
                break

            # Add param for article
            template.add(param_cnt, article, True)

        # Add begin
        # NOTE(review): `_mysql` is not defined in this module - confirm
        begin = self._mysql.data['beginning'].strftime("%B %Y")
        template.add("Beginn", begin, True)

        # Add end (if not same as begin)
        end = self._mysql.data['ending'].strftime("%B %Y")
        if end != begin:
            template.add("Ende", end, True)

        # Add link to related reddisc
        template.add("Diskussion", self.get_disc_link(), True)

        # Signature and timestamp are not used atm
        return template

    @classmethod
    def list_by_status(cls, status):
        """
        Lists red_fams stored in db by given status

        @param status Status to select by
        @raises RedFamHashError Re-raised after printing the affected row
        """
        mysql = MysqlRedFam()
        for fam in mysql.get_by_status(status):
            try:
                print(cls(fam))
            except RedFamHashError:
                print(fam)
                raise

    @classmethod
    def gen_by_status_and_ending(cls, status, ending):
        """
        Yield red_fams stored in db by given status which have an ending after
        given one

        @param status Status to select by
        @param ending Ending to select by
        @raises RedFamHashError Re-raised after printing the affected row
        """
        mysql = MysqlRedFam()
        for fam in mysql.get_by_status_and_ending(status, ending):
            try:
                yield cls(fam)
            except RedFamHashError:
                print(fam)
                raise


class RedFamError(Exception):
    """
    Base class for all Errors of RedFam-Module
    """

    def __init__(self, message=None):
        """
        Handles Instantiation of RedFamError's

        @param message Error message, a generic one is used if omitted
        @type message str
        """
        if not message:
            message = "An Error occured while executing a RedFam action"

        # Hand the message to Exception as well so that `args` is filled
        super().__init__(message)
        self.message = message

    def __str__(self):
        """
        Output of error message
        """
        return self.message


class RedFamHashError(RedFamError):
    """
    Raised when given RedFamHash does not match with calculated
    """

    def __init__(self, givenHash, calculatedHash):
        """
        @param givenHash        str     Hash which was provided
        @param calculatedHash   str     Hash which was calculated
        """
        message = (
            "Given fam_hash ('{given}') does not match with "
            "calculated ('{calc}')").format(
                given=givenHash, calc=calculatedHash)

        super().__init__(message)


class RedFamHeadingError(RedFamError):
    """
    Raised when given RedFamHeading does not match __sectionhead_pat Regex
    """

    def __init__(self, heading):
        """
        @param heading  str     Heading which could not be parsed
        """
        message = (
            "Error while trying to parse section heading. Given heading "
            "'{heading}' does not match RegEx").format(heading=heading)

        super().__init__(message)