#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
#  redfam.py
#
#  Copyright 2015 GOLDERWEB – Jonathan Golder
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#
"""
Provides classes for working with RedFams
"""

import hashlib
import locale
import re
from datetime import datetime

import mwparserfromhell as mwparser  # noqa
import pywikibot  # noqa
from pywikibot.tools import deprecated  # noqa

import jogobot
from lib.mysqlred import MysqlRedFam


class RedFam:
    """
    Basic class for RedFams, containing the basic data structure
    """

    def __init__(self, articlesList, beginning, ending=None, redpageid=None,
                 status=None, famhash=None, heading=None):
        """
        Generates a new RedFam object

        @param articlesList list        List of articles
        @param beginning    datetime    Beginning date
        @param ending       datetime    Ending date
        @param redpageid    int         MW pageid of containing RedPage
        @param status       str         Comma separated status string
        @param famhash      str         SHA1 hash of articlesList
        @param heading      str         Original heading of RedFam (Link)

        @raises RedFamHashError  If famhash is given but does not match the
                                 hash calculated from articlesList
        """

        # Having pywikibot.Site() is a good idea most of the time
        self.site = pywikibot.Site()

        # Database interface
        self._mysql = MysqlRedFam(famhash)

        # Initial attribute values
        self._articlesList = articlesList
        self._beginning = beginning
        self._ending = ending
        self._redpageid = redpageid
        self._famhash = famhash
        self._heading = heading

        # _parse_status() stores the resulting set in self._status itself;
        # its result must not be discarded or overwritten here
        self._parse_status(status)

        # Calculates the sha1 hash over self._articlesList to
        # rediscover known redundance families
        self.calc_famhash()

    def __repr__(self):
        """
        Returns representation str of RedFam object

        @returns str    repr() string
        """

        return (
            "RedFam( "
            "articlesList={articles!r}"
            ", heading={heading!r}"
            ", beginning={beginning!r}"
            ", ending={ending!r}"
            ", red_page_id={redpageid!r}"
            ", status={status!r}"
            ", fam_hash={famhash!r}"
            " )"
        ).format(
            articles=self._articlesList,
            heading=self._heading,
            beginning=self._beginning,
            ending=self._ending,
            redpageid=self._redpageid,
            status=self._status,
            famhash=self._famhash)

    def calc_famhash(self):
        """
        Calculates the SHA-1 hash for the articlesList of redundance family.
        Since we don't need security SHA-1 is just fine.

        Only the first 8 articles are hashed. If a hash is already stored it
        is verified against the calculated one, otherwise the calculated
        hash is stored.

        @returns str    String with the hexadecimal hash digest
        @raises RedFamHashError  If the stored hash differs from the
                                 calculated one
        """

        digest = hashlib.sha1(
            str(self._articlesList[:8]).encode('utf-8')).hexdigest()

        if self._famhash and digest != self._famhash:
            raise RedFamHashError(self._famhash, digest)

        if not self._famhash:
            self._famhash = digest

        return self._famhash

    def changed(self):
        """
        Checks whether anything has changed compared to the data known from
        the database and triggers a db update only in that case
        """

        data = self._mysql.data

        # On archived redfams do not delete possibly existing ending
        if not self._ending and "archived" in self._status and data['ending']:
            self._ending = data['ending']

        # Status is compared as a set, so neither the order of the stati in
        # the stored string nor stray commas cause needless updates
        if (self._status != self._split_status(data['status'])
                or self._beginning != data['beginning']
                or self._ending != data['ending']
                or self._redpageid != data['redpageid']
                or self._heading != data['heading']):

            self._mysql.update_fam(self._redpageid, self._heading,
                                   self._beginning, self._ending,
                                   self._raw_status())

    @classmethod
    def flush_db_cache(cls):
        """
        Calls flush method of Mysql Interface class
        """
        MysqlRedFam.flush()

    def add_status(self, status):
        """
        Adds a status specified by status, to status set

        @param status Statusstring to add
        @type status str
        """
        self._status.add(status)

    def remove_status(self, status, weak=True):
        """
        Removes a status, specified by status from set. If weak is set to
        False it will throw a KeyError when trying to remove a status not set.

        @param status Statusstring to remove
        @type status str
        @param weak Change behavior on missing status
        @type weak bool
        """
        if weak:
            self._status.discard(status)
        else:
            self._status.remove(status)

    def has_status(self, status):
        """
        Returns True, if redfam has given status

        @param status Statusstring to check
        @type status str
        @returns True if status is present else False
        """
        return status in self._status

    @staticmethod
    def _split_status(raw_status):
        """
        Splits a comma separated status string into a set of stati

        Missing (None) or empty input results in an empty set; empty
        elements (e.g. from "a,,b" or a trailing comma) are dropped.

        @param raw_status Commaseparated string of stati (from DB)
        @type raw_status str or None
        @returns Set of status strings
        @rtype set
        """
        if not raw_status:
            return set()

        stati = (status.strip() for status in raw_status.split(","))
        return {status for status in stati if status}

    def _parse_status(self, raw_status):
        """
        Sets status based on comma separated list

        @param raw_status Commaseparated string of stati (from DB)
        @type raw_status str or None
        @returns The status set which was stored in self._status
        @rtype set
        """
        self._status = self._split_status(raw_status)
        return self._status

    def _raw_status(self):
        """
        Returns status as commaseparated string (to save in DB)

        The stati are sorted, so the same set always results in the same
        string.

        @returns Raw status string
        @rtype str
        """
        return ",".join(sorted(self._status))

    def _article_index(self, index=None, title=None):
        """
        Resolves the position of an article in articlesList

        The index is used if one is given (0 included); the title is only
        looked up if no index was provided.

        @param index Index of article in articlesList
        @type index int
        @param title Title of article in articlesList
        @type title str
        @returns Validated index
        @rtype int
        @raises IndexError  If neither a usable index nor a title is given
        @raises ValueError  If title is not part of articlesList
        """
        if index is None and title:
            index = self._articlesList.index(title)

        count = len(self._articlesList)
        if isinstance(index, int) and -count <= index < count:
            return index

        raise IndexError("No index given or wrong format!")

    def article_add_status(self, status, index=None, title=None):
        """
        Adds a status specified by status, to article (identified by title
        or index in articlesList) status set

        @param status Statusstring to add
        @type status str
        @param index Add to article with index in articlesList
        @type index int
        @param title Add to article with title in articlesList
        @type title str
        @raises IndexError  If the article can not be identified
        """
        index = self._article_index(index, title)
        self._article_status[index].add(status)

    def article_remove_status(self, status, index=None, title=None,
                              weak=True):
        """
        Removes a status specified by status, from article (identified by
        title or index in articlesList) status set
        If weak is set to False it will throw a KeyError when trying to
        remove a status not set.

        @param status Statusstring to remove
        @type status str
        @param index Remove from article with index in articlesList
        @type index int
        @param title Remove from article with title in articlesList
        @type title str
        @param weak Change behavior on missing status
        @type weak bool
        @raises IndexError  If the article can not be identified
        """
        index = self._article_index(index, title)

        if weak:
            self._article_status[index].discard(status)
        else:
            self._article_status[index].remove(status)

    def article_has_status(self, status, index=None, title=None):
        """
        Checks whether the article (identified by title or index in
        articlesList) has the given status

        @param status Statusstring to check
        @type status str
        @param index Check article with index in articlesList
        @type index int
        @param title Check article with title in articlesList
        @type title str
        @returns True if status is present else False
        @raises IndexError  If the article can not be identified
        """
        index = self._article_index(index, title)
        return status in self._article_status[index]

    def _article_parse_status(self, raw_status, index=None, title=None):
        """
        Sets status based on comma separated list to articles (identified by
        title or index in articlesList) status set

        @param raw_status Commaseparated string of stati to set
        @type raw_status str or None
        @param index Set for article with index in articlesList
        @type index int
        @param title Set for article with title in articlesList
        @type title str
        @raises IndexError  If the article can not be identified
        """
        index = self._article_index(index, title)
        self._article_status[index] = self._split_status(raw_status)

    def _article_raw_status(self, index=None, title=None):
        """
        Returns status as commaseparated string (to save in DB) of article
        (identified by title or index in articlesList) status set

        @param index Get from article with index in articlesList
        @type index int
        @param title Get from article with title in articlesList
        @type title str
        @returns Raw status string
        @rtype str
        @raises IndexError  If the article can not be identified
        """
        index = self._article_index(index, title)
        return ",".join(sorted(self._article_status[index]))


class RedFamParser(RedFam):
    """
    Provides an interface to RedFam for adding/updating redundance families
    while parsing redundance pages
    """

    # Define the timestamp format
    __timestamp_format = jogobot.config['redundances']['timestamp_format']

    # Define section heading re.pattern
    __sectionhead_pat = re.compile(r"^(.*\[\[.+\]\].*\[\[.+\]\].*)")

    # Define timestamp re.pattern
    __timestamp_pat = re.compile(
        jogobot.config['redundances']['timestamp_regex'])

    # Textpattern for recognisation of done-notices
    __done_notice = (":Archivierung dieses Abschnittes "
                     "wurde gewünscht von:")
    __done_notice2 = "{{Erledigt|"

    def __init__(self, heading, redpage, redpagearchive,
                 beginning, ending=None):
        """
        Creates a RedFam object based on data collected while parsing
        red_pages combined with possibly former known data from db

        The constructor of RedFam is deliberately not called; all needed
        attributes are set up here step by step.

        @param heading          str         Wikitext heading of section
        @param redpage          page        Pywikibot.page object
        @param redpagearchive   bool        Is red_page an archive
        @param beginning        datetime    Timestamp of beginning
                                str         as strptime parseable string
        @param ending           datetime    Timestamp of ending
                                str         strptime parseable string
        """

        # Set object attributes:
        self._redpageid = redpage._pageid
        self._redpagearchive = redpagearchive
        self._famhash = None

        # Method self.add_beginning sets self._beginning directly
        self.add_beginning(beginning)

        # Method self.add_ending sets self._ending directly
        if ending:
            self.add_ending(ending)
        else:
            # If no ending was provided set to None
            self._ending = None

        self._status = set()

        # Parse the provided heading of redundance section
        # to set self._articlesList
        self.heading_parser(heading)

        # Calculates the sha1 hash over self._articlesList to
        # rediscover known redundance families
        self.calc_famhash()

        # Open database connection, ask for data if existing,
        # otherwise create entry
        self.__handle_db()

        # Check status changes
        self.status()

        # Triggers db update if anything changed
        self.changed()

    def __handle_db(self):
        """
        Handles opening of db connection

        Asks the database for the fam with the calculated hash and adds a
        new entry if nothing is known yet.
        """

        # We need a connection to our mysqldb
        self._mysql = MysqlRedFam()
        self._mysql.get_fam(self._famhash)

        # NOTE(review): a status already stored in the db is not merged into
        # self._status here, so changed() will write back only the stati
        # detected by status() - confirm that this is intended
        if not self._mysql.data:
            self._mysql.add_fam(self._articlesList, self._heading,
                                self._redpageid, self._beginning,
                                self._ending)

    def heading_parser(self, heading):
        """
        Parses given red_fam_heading string and saves articles list

        Only the first 8 articles are kept; a warning is emitted if the
        heading links more than that.

        @param heading Heading of RedFam-Section
        @type heading wikicode or mwparser-parseable
        """

        # Save heading as string
        self._heading = str(heading)

        # Parse string heading with mwparse again everytime
        # In some cases the given wikicode is broken due to syntax errors
        # (Task FS#77)
        heading = mwparser.parse(self._heading)

        # Save destinations of wikilinks in headings
        self._articlesList = [str(link.title) for link
                              in heading.ifilter_wikilinks()]

        # Catch sections with more then 8 articles, print error
        if len(self._articlesList) > 8:
            # For representation in output we need to know the fam hash
            self.calc_famhash()

            jogobot.output(
                ("\03{{lightred}}" +
                 "Maximum number of articles in red_fam exceeded, " +
                 "maximum number is 8, {number:d} were given \n {repress}"
                 ).format(number=len(self._articlesList),
                          repress=repr(self)),
                "WARNING")

            # Only save the first 8 articles
            self._articlesList = self._articlesList[:8]

    def add_beginning(self, beginning):
        """
        Adds the beginning date of a redundance diskussion to the object

        @param beginning    datetime    Beginning date
                            str         strptime parseable string
        """

        self._beginning = self.__datetime(beginning)

    def add_ending(self, ending):
        """
        Adds the ending date of a redundance diskussion to the object.

        @param ending   datetime    Ending date
                        str         strptime parseable string
        """

        self._ending = self.__datetime(ending)

    def __datetime(self, timestamp):
        """
        Decides whether given timestamp is a parseable string or a
        datetime object and returns a datetime object in both cases

        As a side effect the process wide locale is set to 'de_DE.UTF-8'.

        @param timestamp    datetime    Datetime object
                            str         Parseable string with timestamp

        @returns            datetime    Datetime object
        """

        # Make sure locale is set to 'de_DE.UTF-8' to prevent problems
        # with wrong month abreviations in strptime
        locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

        if isinstance(timestamp, datetime):
            return timestamp

        return datetime.strptime(timestamp,
                                 type(self).__timestamp_format)

    def status(self):
        """
        Handles detection of correct status

        The following stati are maintained here:
        - "open"      Discussion running
                      --> no ending, page is not an archive
        - "done"      Discussion over
                      --> ending present, page is not an archive
        - "archived"  Discussion archived
                      --> ending (normaly) present, page is archive
        Any other status in the set is left untouched.
        """

        # No ending, discussion is running:
        # Sometimes archived discussions also have no detectable ending
        if not self._ending and not self._redpagearchive:
            self.add_status("open")
            return

        self.remove_status("open")

        if not self._redpagearchive:
            self.add_status("done")
        else:
            self.remove_status("done")
            self.add_status("archived")

    @classmethod
    def is_section_redfam_cb(cls, heading):
        """
        Used as callback for wikicode.get_sections in redpage.parse to
        select sections which are redfams

        @param heading Heading of the section to check
        @type heading wikicode or mwparser-parseable
        @returns True if heading contains at least two wikilinks
        @rtype bool
        """
        # Because of strange behavior in some cases, parse heading again
        # (Task FS#77)
        heading = mwparser.parse(str(heading))

        # Make sure we have min. two wikilinks in heading to assume a redfam
        return len(heading.filter_wikilinks()) >= 2

    @classmethod
    def parser(cls, text, page, isarchive=False):
        """
        Handles parsing of redfam section

        @param text Text of RedFam-Section
        @type text wikicode or mwparser-parseable
        @param page Page the section was found on
        @type page Pywikibot.page object
        @param isarchive Is page an archive
        @type isarchive bool
        """

        # Parse heading with mwparse if needed
        if not isinstance(text, mwparser.wikicode.Wikicode):
            text = mwparser.parse(text)

        # Extract heading text
        heading = next(text.ifilter_headings()).title

        # Extract beginnig and maybe ending
        beginning, ending = RedFamParser.extract_dates(text, isarchive)

        # Missing beginning (Task: FS#76)
        # Use first day of month of reddisc
        if not beginning:
            match = re.search(
                jogobot.config["redundances"]["reddiscs_onlyinclude_re"],
                page.title())

            if match:
                # %B is locale dependent; the locale is otherwise only set
                # once the first RedFamParser object has been created
                # NOTE(review): assumes month names in page titles are
                # german like everywhere else in this module - confirm
                locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

                beginning = datetime.strptime(
                    "01. {month} {year}".format(
                        month=match.group(1), year=match.group(2)),
                    "%d. %B %Y")

        # Create the RedFam object
        RedFamParser(heading, page, isarchive, beginning, ending)

    @staticmethod
    def _format_timestamp(groups):
        """
        Rebuilds a timestamp string from the first four regex match groups

        @param groups Match groups of one timestamp match
        @type groups tuple of strs
        @returns Reconstructed timestamp
        @rtype str
        """
        return "{0}, {1}. {2}. {3}".format(*groups[:4])

    @classmethod
    def extract_dates(cls, text, isarchive=False):
        """
        Returns tuple of the first and maybe last timestamp of a section.
        Last timestamp is only returned if there is a done notice or param
        *isarchive* is set to 'True'

        @param text Text to search in
        @type text Any Type castable to str
        @param isarchive If true skip searching done notice (on archivepages)
        @type isarchive bool

        @returns Timestamps, otherwise None
        @returntype tuple of strs
        """

        # Work on the plain string for regex and substring search alike
        text = str(text)

        # Match all timestamps
        matches = cls.__timestamp_pat.findall(text)

        # Missing dates (Task: FS#76)
        if not matches:
            return (None, None)

        # First one is beginning
        # Since some timestamps are broken we need to reconstruct them
        # by regex match groups
        beginning = cls._format_timestamp(matches[0])

        # Last one maybe is ending
        # Done notice format 1
        # Done notice format 2
        # Or on archivepages
        if (cls.__done_notice in text or
                cls.__done_notice2 in text or
                isarchive):
            ending = cls._format_timestamp(matches[-1])
        else:
            ending = None

        return (beginning, ending)


class RedFamWorker(RedFam):
    """
    Handles working with redundance families stored in database
    where discussion is finished
    """

    def __init__(self, mysql_data):
        """
        Creates a RedFam object from a database row

        As a side effect the process wide locale is set to 'de_DE.UTF-8'.

        @param mysql_data Row with data of redfam and related redpage
        @type mysql_data dict
        @raises RedFamHashError  If the stored famhash does not match the
                                 stored articles
        """

        # Collect the non-empty article columns in order of their keys,
        # skipping the related *_status columns
        articlesList = [
            mysql_data[key] for key in sorted(mysql_data.keys())
            if 'article' in key and 'status' not in key and mysql_data[key]]

        # Preset article status list with empty sets for existing articles
        self._article_status = [set() for _ in articlesList]

        super().__init__(articlesList, mysql_data['beginning'],
                         mysql_data['ending'], mysql_data['redpageid'],
                         mysql_data['status'], mysql_data['famhash'],
                         mysql_data['heading'])

        self._mysql.data = mysql_data

        # Set up article status
        for index in range(len(self._articlesList)):
            raw_status = mysql_data["article" + str(index) + "_status"]
            self._article_parse_status(raw_status or "", index)

        # Get related RedPage-Information
        self.redpageid = mysql_data['pageid']
        self.redpagetitle = mysql_data['pagetitle']

        # Make sure locale is set to 'de_DE.UTF-8' to prevent problems
        # with wrong month abreviations in strptime
        locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

    def article_generator(self, filter_existing=None, filter_redirects=None):
        """
        Yields pywikibot pageobjects for articles belonging to this redfams
        in a generator

        @param filter_existing Set to True to only get existing pages
                               set to False to only get nonexisting pages
                               unset/None results in not filtering
        @type filter_existing bool/None
        @param filter_redirects Set to True to get only noredirectpages,
                                set to False to get only redirectpages,
                                unset/None results in not filtering
        @type filter_redirects bool/None
        """
        # Iterate over articles in redfam
        for article in self._articlesList:
            page = pywikibot.Page(pywikibot.Link(article), self.site)

            # Filter non existing Pages if requested with filter_existing=True
            if filter_existing and not page.exists():
                continue
            # Filter existing pages if requested with filter_existing=False
            elif filter_existing is False and page.exists():
                continue

            # Filter redirects if requested with filter_redirects=True
            if filter_redirects and page.isRedirectPage():
                continue
            # Filter noredirects if requested with filter_redirects=False
            elif filter_redirects is False and not page.isRedirectPage():
                continue

            # Yield filtered pages
            yield page

    def update_status(self):
        """
        Placeholder for marking the redfam as worked on; does nothing yet
        """
        pass

    def get_disc_link(self):
        """
        Constructs and returns the link to Redundancy discussion

        @returns Link to diskussion
        @rtype str
        """

        # We need to Replace Links with their linktext
        anchor_code = mwparser.parse(self._mysql.data['heading'].strip())

        # Iterate over a list of the links, since the wikicode is modified
        # inside of the loop
        for link in anchor_code.filter_wikilinks():
            text = link.text if link.text else link.title
            anchor_code.replace(link, text)

        # Whitespace is replaced with underscores
        # Done on the plain string, so a heading without any space can not
        # make a search inside of the wikicode fail
        anchor = str(anchor_code).replace(" ", "_").strip()

        # We try it with out any more parsing as mw will do while parsing page
        return self.redpagetitle + "#" + anchor

    def generate_disc_notice_template(self):
        """
        Generates notice template to add on discussion Pages of Articles when
        redundancy discussion is finished

        The end of discussion is left out if it is unknown or falls into the
        same month as the beginning.

        @return Notice template to add on article disc
        @rtype wikicode-node
        """

        # Generate template boilerplate
        template = mwparser.nodes.template.Template(
            jogobot.config['redundances']['disc_notice_template_name'])

        # Articles are added as params 3 to 10, therefore make sure to
        # only use 8 articles
        for param, article in enumerate(self._articlesList[:8], start=3):
            template.add(param, article, True)

        # Add begin
        begin = self._mysql.data['beginning'].strftime("%B %Y")
        template.add("Beginn", begin, True)

        # Add end (if known and not same as begin)
        ending = self._mysql.data['ending']
        if ending:
            end = ending.strftime("%B %Y")
            if end != begin:
                template.add("Ende", end, True)

        # Add link to related reddisc
        template.add("Diskussion", self.get_disc_link(), True)

        # Signature and timestamp (param 1) are not added at the moment

        return template

    @classmethod
    def list_by_status(cls, status):
        """
        Lists red_fams stored in db by given status

        @param status Status to search for
        @raises RedFamHashError  After printing the related db row
        """
        mysql = MysqlRedFam()
        for fam in mysql.get_by_status(status):
            try:
                print(cls(fam))
            except RedFamHashError:
                print(fam)
                raise

    @classmethod
    def gen_by_status_and_ending(cls, status, ending):
        """
        Yield red_fams stored in db by given status which have an ending after
        given one

        @param status Status to search for
        @param ending Ending date to compare with
        @raises RedFamHashError  After printing the related db row
        """
        mysql = MysqlRedFam()
        for fam in mysql.get_by_status_and_ending(status, ending):
            try:
                yield cls(fam)
            except RedFamHashError:
                print(fam)
                raise


class RedFamError(Exception):
    """
    Base class for all Errors of RedFam-Module
    """

    def __init__(self, message=None):
        """
        Handles Instantiation of RedFamError's

        @param message Error message, a generic one is used if missing
        @type message str
        """
        if not message:
            self.message = "An Error occured while executing a RedFam action"
        else:
            self.message = message

        # Hand the message to Exception as well to get it into self.args
        super().__init__(self.message)

    def __str__(self):
        """
        Output of error message
        """

        return self.message


class RedFamHashError(RedFamError):
    """
    Raised when given RedFamHash does not match with calculated
    """

    def __init__(self, givenHash, calculatedHash):
        """
        @param givenHash Hash which was provided
        @type givenHash str
        @param calculatedHash Hash which was calculated
        @type calculatedHash str
        """

        message = ("Given fam_hash ('{given}') does not match with "
                   "calculated ('{calc}')").format(
                       given=givenHash, calc=calculatedHash)

        super().__init__(message)


class RedFamHeadingError(RedFamError):
    """
    Raised when given RedFamHeading does not match __sectionhead_pat Regex
    """

    def __init__(self, heading):
        """
        @param heading Heading which could not be parsed
        @type heading str
        """

        message = ("Error while trying to parse section heading. "
                   "Given heading '{heading}' does not match RegEx").format(
                       heading=heading)

        super().__init__(message)