#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
#  redfam.py
#
#  Copyright 2015 GOLDERWEB – Jonathan Golder
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#
"""
Provides classes for working with RedFams
"""

import hashlib
import locale
import re
from datetime import datetime

import pywikibot

import jogobot
from .mysqlred import MysqlRedFam


class RedFam:
    """
    Basic class for RedFams, containing the basic data structure
    """

    def __init__(self, articlesList, beginning, ending=None,
                 red_page_id=None, status=0, fam_hash=None, heading=None):
        """
        Generates a new RedFam object

        @param articlesList list        List of articles
        @param beginning    datetime    Beginning date
        @param ending       datetime    Ending date
        @param red_page_id  int         MW pageid of containing RedPage
        @param status       int         Status of RedFam
        @param fam_hash     str         SHA1 hash of articlesList
        @param heading      str         Original heading of RedFam (Link)

        @raises RedFamHashError  If fam_hash is given but does not match
                                 the hash calculated from articlesList
        """

        # Initial attribute values
        self._articlesList = articlesList
        self._beginning = beginning
        self._ending = ending
        self._red_page_id = red_page_id
        self._status = status
        self._fam_hash = fam_hash
        self._heading = heading

        # Calculates the sha1 hash over self._articlesList to
        # rediscover known redundance families (also verifies a given hash)
        self.calc_fam_hash()

    def __repr__(self):
        """
        Returns representation str of RedFam object

        @returns str    repr() string
        """

        template = (
            "RedFam( "
            "articlesList={articles}"
            ", heading={heading}"
            ", beginning={beginning}"
            ", ending={ending}"
            ", red_page_id={red_page_id}"
            ", status={status}"
            ", fam_hash={fam_hash}"
            " )")

        return template.format(
            articles=repr(self._articlesList),
            heading=repr(self._heading),
            beginning=repr(self._beginning),
            ending=repr(self._ending),
            red_page_id=repr(self._red_page_id),
            status=repr(self._status),
            fam_hash=repr(self._fam_hash))

    def calc_fam_hash(self):
        """
        Calculates the SHA-1 hash for the articlesList of redundance family.
        Since we don't need security SHA-1 is just fine.

        Only the first 8 articles are taken into account. If a hash is
        already stored on the object it is verified against the calculated
        one, otherwise the calculated one is stored.

        @returns str    String with the hexadecimal hash digest

        @raises RedFamHashError  If the stored hash differs from the
                                 calculated one
        """

        digest = hashlib.sha1(
            str(self._articlesList[:8]).encode('utf-8')).hexdigest()

        if not self._fam_hash:
            # No hash known yet, adopt the calculated one
            self._fam_hash = digest
        elif digest != self._fam_hash:
            raise RedFamHashError(self._fam_hash, digest)

        return self._fam_hash

    @classmethod
    def flush_db_cache(cls):
        """
        Calls flush method of Mysql Interface class
        """
        MysqlRedFam.flush()


class RedFamParser(RedFam):
    """
    Provides an interface to RedFam for adding/updating redundance families
    while parsing redundance pages
    """

    # Define the timestamp format
    __timestamp_format = jogobot.timestamp_format

    # Define section heading re.pattern
    __sectionhead_pat = re.compile(r"^(=+)(.*\[\[.+\]\].*\[\[.+\]\].*)\1")

    # Define timestamp re.pattern
    __timestamp_pat = re.compile(jogobot.timestamp_regex)

    # Pattern for the destination of wikilinks (compiled once per class
    # instead of once per parsed heading)
    __wikilink_pat = re.compile(r"\[\[([^\[\]\|]+)(?:\]\]|\|)")

    # Text patterns for recognition of done-notices
    __done_notice = ":Archivierung dieses Abschnittes wurde gewünscht von:"
    __done_notice2 = "{{Erledigt|"

    def __init__(self, heading, red_page_id, red_page_archive,
                 beginning, ending=None):
        """
        Creates a RedFam object based on data collected while parsing
        red_pages combined with possibly former known data from db

        @param heading          str         Wikitext heading of section
        @param red_page_id      int         MediaWiki page_id
        @param red_page_archive bool        Is red_page an archive
        @param beginning        datetime    Timestamp of beginning
                                str         as strptime parseable string
        @param ending           datetime    Timestamp of ending
                                str         as strptime parseable string

        @raises RedFamHeadingError  If heading is no red_fam section heading
        """

        # Set object attributes:
        self._red_page_id = red_page_id
        self._red_page_archive = red_page_archive
        self._fam_hash = None

        # Method self.add_beginning sets self._beginning directly
        self.add_beginning(beginning)

        # Method self.add_ending sets self._ending directly
        if ending:
            self.add_ending(ending)
        else:
            # If no ending was provided set to None
            self._ending = None

        self._status = None

        # Parse the provided heading of redundance section
        # to set self._articlesList
        self.heading_parser(heading)

        # Calculates the sha1 hash over self._articlesList to
        # rediscover known redundance families
        self.calc_fam_hash()

        # Open database connection, ask for data if existing,
        # otherwise create entry
        self.__handle_db()

        # Check status changes
        self.status()

        # Triggers db update if anything changed
        self.changed()

    def __handle_db(self):
        """
        Handles opening of db connection

        Looks the family up by its hash and adds it to the database if no
        data was found.
        """

        # We need a connection to our mysqldb
        self.__mysql = MysqlRedFam()
        self.__mysql.get_fam(self._fam_hash)

        if not self.__mysql.data:
            self.__mysql.add_fam(self._articlesList, self._heading,
                                 self._red_page_id, self._beginning,
                                 self._ending)

    def heading_parser(self, heading):
        """
        Parses given red_fam_heading string and saves articles list

        Sets self._heading and self._articlesList (limited to 8 articles).

        @param heading  str     Wikitext heading of section

        @raises RedFamHeadingError  If heading does not match the section
                                    heading pattern
        """

        # Parse content of heading for generating section links later
        match = type(self).__sectionhead_pat.search(heading)
        if not match:
            raise RedFamHeadingError(heading)

        self._heading = match.group(2).strip()

        # Strip leading and trailing whitespace in links to prevent wrong
        # fam_hashes (when receiving redfam from db) since MySQL drops it
        self._articlesList = [
            link.strip()
            for link in type(self).__wikilink_pat.findall(self._heading)]

        # Catch sections with more than 8 articles, print error
        if len(self._articlesList) > 8:
            # For representation in output we need to know the fam hash
            self.calc_fam_hash()

            warning = (
                "{datetime} – \03{{lightred}}[WARNING] – "
                "Maximum number of articles in red_fam exceeded, "
                "maximum number is 8, "
                "{number:d} were given \n {repress}")

            # repr( self ) is taken before truncating, so the output shows
            # all articles which were given
            pywikibot.output(warning.format(
                datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                number=len(self._articlesList),
                repress=repr(self)))

            self._articlesList = self._articlesList[:8]

    def add_beginning(self, beginning):
        """
        Adds the beginning date of a redundance discussion to the object

        @param beginning    datetime    Beginning date
                            str         as strptime parseable string
        """

        self._beginning = self.__datetime(beginning)

    def add_ending(self, ending):
        """
        Adds the ending date of a redundance discussion to the object.

        @param ending       datetime    Ending date
                            str         as strptime parseable string
        """

        self._ending = self.__datetime(ending)

    def __datetime(self, timestamp):
        """
        Decides whether given timestamp is a parseable string or a
        datetime object and returns a datetime object in both cases

        @param timestamp    datetime    Datetime object
                            str         Parseable string with timestamp

        @returns datetime   Datetime object
        """

        # Make sure locale is set to 'de_DE.UTF-8' to prevent problems
        # with wrong month abbreviations in strptime
        locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

        if isinstance(timestamp, datetime):
            return timestamp

        return datetime.strptime(timestamp,
                                 type(self).__timestamp_format)

    def status(self):
        """
        Handles detection of correct status
        There are four possible kinds of status:
        - 0 Discussion running --> no ending, page is not an archive
        - 1 Discussion over --> ending present, page is not an archive
        - 2 Discussion archived --> ending (normally) present, page is archive
        - 3 and greater status was set by worker script, do not change it
        """

        db_status = self.__mysql.data['status']

        if db_status > 2:
            # Do not change status set by worker script etc.
            self._status = db_status
        elif self._red_page_archive:
            # Sometimes archived discussions have no detectable ending,
            # so the archive flag alone decides
            self._status = 2
        elif self._ending:
            self._status = 1
        else:
            # No ending, discussion is running
            self._status = 0

    def changed(self):
        """
        Checks whether anything has changed and maybe triggers db update
        """

        data = self.__mysql.data

        # On archived red_fams do not delete possibly existing ending
        if not self._ending and self._status > 1 and data['ending']:
            self._ending = data['ending']

        # Since status change means something has changed, update database
        if (self._status != data['status'] or
                self._beginning != data['beginning'] or
                self._ending != data['ending'] or
                self._red_page_id != data['red_page_id'] or
                self._heading != data['heading']):

            self.__mysql.update_fam(self._red_page_id, self._heading,
                                    self._beginning, self._ending,
                                    self._status)

    @staticmethod
    def __rebuild_timestamp(parts):
        """
        Joins the first four timestamp regex groups to a timestamp string

        Since some timestamps are broken we need to reconstruct them
        by regex match groups

        @param parts    tuple   Match groups of the timestamp pattern

        @returns str    Reconstructed timestamp
        """

        return (parts[0] + ", " + parts[1] + ". " +
                parts[2] + ". " + parts[3])

    @classmethod
    def is_sectionheading(cls, line):
        """
        Checks whether given line is a red_fam section heading

        @param line     str     String to check

        @returns bool   Returns True if it is a section heading
        """

        return bool(cls.__sectionhead_pat.search(line))

    @classmethod
    def is_beginning(cls, line):
        """
        Returns the first timestamp found in line, otherwise None

        @param line     str     String to search in

        @returns str    Timestamp, otherwise None
        """

        match = cls.__timestamp_pat.search(line)
        if not match:
            return None

        return cls.__rebuild_timestamp(match.group(1, 2, 3, 4))

    @classmethod
    def is_ending(cls, line):
        """
        Returns the timestamp of done notice ( if one ), otherwise None

        @param line     str     String to search in

        @returns str    Timestamp, otherwise None
        """

        if (cls.__done_notice in line) or (cls.__done_notice2 in line):
            match = cls.__timestamp_pat.search(line)
            if match:
                return cls.__rebuild_timestamp(match.group(1, 2, 3, 4))

        return None

    @classmethod
    def is_ending2(cls, line):
        """
        Returns the last timestamp found in line, otherwise None

        @param line     str     String to search in

        @returns str    Timestamp, otherwise None
        """

        matches = cls.__timestamp_pat.findall(line)
        if not matches:
            return None

        return cls.__rebuild_timestamp(matches[-1])


class RedFamWorker(RedFam):
    """
    Handles working with redundance families stored in database
    where discussion is finished
    """

    def __init__(self, mysql_data):
        """
        Creates a RedFam object from a data set fetched from database

        @param mysql_data   dict    Data set of one red_fam; every non-empty
                                    value whose key contains 'article' is
                                    used as article (ordered by key)

        @raises RedFamHashError  If stored fam_hash does not match the
                                 hash calculated from the articles
        """

        articlesList = [
            mysql_data[key]
            for key in sorted(mysql_data.keys())
            if 'article' in key and mysql_data[key]]

        super().__init__(articlesList,
                         mysql_data['beginning'],
                         mysql_data['ending'],
                         mysql_data['red_page_id'],
                         mysql_data['status'],
                         mysql_data['fam_hash'],
                         mysql_data['heading'])

    @classmethod
    def list_by_status(cls, status):
        """
        Lists red_fams stored in db by given status

        @param status   int     Status to list red_fams for

        @raises RedFamHashError  Re-raised after printing the raw data set
                                 which caused it
        """

        mysql = MysqlRedFam()
        for fam in mysql.get_by_status(status):
            try:
                print(cls(fam))
            except RedFamHashError:
                # Show the offending raw data set before giving up
                print(fam)
                raise


class RedFamError(Exception):
    """
    Base class for all Errors of RedFam-Module
    """

    def __init__(self, message=None):
        """
        Handles Instantiation of RedFamError's

        @param message  str     Error message, a generic one is used if
                                omitted
        """
        if not message:
            message = "An Error occured while executing a RedFam action"

        self.message = message

        # Hand the message to Exception as well so that args, repr()
        # and pickling carry it
        super().__init__(message)

    def __str__(self):
        """
        Output of error message
        """

        return self.message


class RedFamHashError(RedFamError):
    """
    Raised when given RedFamHash does not match with calculated
    """

    def __init__(self, givenHash, calculatedHash):
        """
        @param givenHash        str     Hash which was provided
        @param calculatedHash   str     Hash which was calculated
        """

        message = (
            "Given fam_hash ('{given}') does not match with "
            "calculated ('{calc}')").format(
                given=givenHash, calc=calculatedHash)

        super().__init__(message)


class RedFamHeadingError(RedFamError):
    """
    Raised when given RedFamHeading does not match __sectionhead_pat Regex
    """

    def __init__(self, heading):
        """
        @param heading  str     Heading which could not be parsed
        """

        message = (
            "Error while trying to parse section heading. Given heading "
            "'{heading}' does not match RegEx").format(heading=heading)

        super().__init__(message)