diff --git a/bots/__init__.py b/bots/__init__.py new file mode 100644 index 0000000..9327388 --- /dev/null +++ b/bots/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- diff --git a/bots/reddiscparser.py b/bots/reddiscparser.py new file mode 100644 index 0000000..818eb05 --- /dev/null +++ b/bots/reddiscparser.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# reddiscparser.py +# +# Copyright 2016 GOLDERWEB – Jonathan Golder +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. 
+# +# +""" +Bot to parse all reddisc pages in given Generator or configured categories +""" + +import re + +import pywikibot # noqa +from pywikibot import pagegenerators # noqa +from pywikibot.bot import ExistingPageBot, NoRedirectPageBot + +import jogobot + +from lib import redpage +from lib import redfam + + +class DiscussionParserBot( + # CurrentPageBot, # via next two sets 'current_page' on each treat() + ExistingPageBot, # CurrentPageBot only treats existing pages + NoRedirectPageBot ): # class which only treats non-redirects + """ + Bot class which initialises the parsing process of Redundancy Discussions + """ + + # RegEx to filter wrong pages + onlyinclude_re = re.compile( + jogobot.config["redundances"]["reddiscs_onlyinclude_re"] ) + + def __init__( self, genFactory, **kwargs ): + """ + Constructor + + Parameters: + @param genFactory GenFactory with parsed pagegenerator args to + build generator + @type genFactory pagegenerators.GeneratorFactory + @param **kwargs Additional args + @type iterable + """ + + # Copy needed args + self.genFactory = genFactory + + # Build generator with genFactory + self.build_generator() + + # Run super class init with built generator + super( DiscussionParserBot, self ).__init__(generator=self.gen) + + def build_generator(self): + """ + Builds generator to work on, based on self.genFactory + """ + # Check whether there are generators waiting for factoring, if not + # use configured categories + if not self.genFactory.gens: + self.apply_conf_cat_generators() + + # Create combined Generator (Union of all Generators) + gen = self.genFactory.getCombinedGenerator() + + if gen: + # The preloading generator is responsible for downloading multiple + # pages from the wiki simultaneously. 
+ self.gen = pagegenerators.PreloadingGenerator(gen) + + else: + pywikibot.showHelp() + + def apply_conf_cat_generators( self ): + """ + Builds generators for categories which are read from jogobot.config + + Parameters: + @param genFactory: The GeneratorFactory to which the built + generators should be added. + @type genFactory: pagegenerators.GeneratorFactory + """ + # Create Generators for configured Categories + for category in jogobot.config["redundances"]["redpage_cats"]: + gen = self.genFactory.getCategoryGen( + category, gen_func=pagegenerators.CategorizedPageGenerator) + + # If there is one, append to genFactory + if gen: + self.genFactory.gens.append(gen) + + # Reset gen for next iteration + gen = None + + def run( self ): + """ + Controls the overall parsing process, using super class for page switch + + Needed to do things before/after treating pages is done + """ + try: + + super( DiscussionParserBot, self ).run() + + except: + raise + + else: + + # If successfully parsed all pages in cat, flush db write cache + redpage.RedPage.flush_db_cache() + + def treat_page( self ): + """ + Handles work on current page + """ + + # Short circuit excluded pages + if self.current_page.title() in ( + jogobot.config["redundances"]["redpage_exclude"] ): + + return + + # Exclude pages which do not match pattern + if not type(self).onlyinclude_re.search( self.current_page.title() ): + + return + + # Initiate RedPage object + red_page = redpage.RedPage( self.current_page ) + + # Check whether parsing is needed + if red_page.is_parsing_needed(): + + # Count families for failure analysis + fam_counter = 0 + + # Iterate over returned generator with redfam sections + for fam in red_page.parse(): + + # Run RedFamParser on section text + redfam.RedFamParser.parser( fam, red_page.page, + red_page.is_archive() ) + + fam_counter += 1 + + else: + # If successfully parsed whole page, flush + # db write cache + if( fam_counter ): + redfam.RedFamParser.flush_db_cache() + 
jogobot.output( "Page [[{reddisc}]] parsed".format( + reddisc=red_page.page.title() ) ) + else: + jogobot.output( + "\03{red}" + "Page [[{reddisc}]], ".format( + reddisc=red_page.page.title() ) + + "containing no redfam, parsed!", + "WARNING" ) diff --git a/jogobot b/jogobot index 2173f29..28d03f3 160000 --- a/jogobot +++ b/jogobot @@ -1 +1 @@ -Subproject commit 2173f2984f1de6950728a15709bf93db5188731d +Subproject commit 28d03f35b848a33ad45d3f5f8f3f82e8c45534ec diff --git a/mysqlred.py b/lib/mysqlred.py similarity index 83% rename from mysqlred.py rename to lib/mysqlred.py index 055b995..499816f 100644 --- a/mysqlred.py +++ b/lib/mysqlred.py @@ -33,6 +33,7 @@ except ImportError: import atexit +import pywikibot from pywikibot import config import jogobot @@ -53,6 +54,7 @@ class MysqlRed: db_username = config.db_username db_password = config.db_password db_name = config.db_username + jogobot.config['db_suffix'] + db_table_prefix = False # Class variables for storing cached querys _cached_update_data = [] @@ -67,6 +69,14 @@ class MysqlRed: @returns mysql-stream MySQL Connection """ + # Needs to be generated after Parsing of Args (not at import time) + if not type(self).db_table_prefix: + type(self).db_table_prefix = \ + pywikibot.Site().family.dbName(pywikibot.Site().code) + + # Now we can setup prepared queries + self._prepare_queries() + # Connect to mysqldb only once if not type( self ).connection: @@ -87,11 +97,23 @@ class MysqlRed: type( self ).connection.close() + def _prepare_queries( self ): + """ + Used to replace placeholders in prepared queries + """ + type(self)._update_query = type(self)._update_query.format( + prefix=type(self).db_table_prefix) + type(self)._insert_query = type(self)._insert_query.format( + prefix=type(self).db_table_prefix) + @classmethod def flush( cls ): """ Run cached querys """ + if not cls.connection: + raise MysqlRedConnectionError( "No connection exists!" 
) + cursor = cls.connection.cursor() # Execute insert query @@ -132,12 +154,13 @@ class MysqlRedPage( MysqlRed ): """ # Class variables for storing cached querys + # '{prefix}' will be replaced during super().__init__() _cached_update_data = [] - _update_query = 'UPDATE `red_pages` \ + _update_query = 'UPDATE `{prefix}_red_pages` \ SET `page_title` = ?, `rev_id` = ?, `status`= ? WHERE `page_id` = ?;' _cached_insert_data = {} - _insert_query = 'INSERT INTO `red_pages` \ + _insert_query = 'INSERT INTO `{prefix}_red_pages` \ ( page_id, page_title, rev_id, status ) VALUES ( ?, ?, ?, ? );' def __init__( self, page_id ): @@ -166,8 +189,10 @@ SET `page_title` = ?, `rev_id` = ?, `status`= ? WHERE `page_id` = ?;' cursor = type( self ).connection.cursor(mysqldb.DictCursor) - cursor.execute( 'SELECT * FROM `red_pages` WHERE `page_id` = ?;', - ( self.__page_id, ) ) + cursor.execute( + 'SELECT * FROM `{prefix}_red_pages` WHERE `page_id` = ?;'.format( + prefix=type(self).db_table_prefix), ( self.__page_id, ) ) + res = cursor.fetchone() if res: @@ -218,12 +243,11 @@ class MysqlRedFam( MysqlRed ): # Class variables for storing cached querys _cached_update_data = [] - _update_query = 'UPDATE `red_families` \ + _update_query = 'UPDATE `{prefix}_red_families` \ SET `red_page_id` = ?, `heading` = ?, `beginning` = ?, `ending` = ?, \ `status`= ? WHERE `fam_hash` = ?;' - _cached_insert_data = {} - _insert_query = 'INSERT INTO `red_families` \ + _insert_query = 'INSERT INTO `{prefix}_red_families` \ ( fam_hash, red_page_id, beginning, ending, status, heading, \ article0, article1, article2, article3, article4, article5, article6, \ article7 ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );' @@ -249,8 +273,10 @@ article7 ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 
);' cursor = type( self ).connection.cursor( mysqldb.DictCursor ) - cursor.execute( 'SELECT * FROM `red_families` WHERE `fam_hash` = ?;', - ( fam_hash, ) ) + cursor.execute( + 'SELECT * FROM `{prefix}_red_families` WHERE `fam_hash` = ?;'. + format( prefix=type(self).db_table_prefix), ( fam_hash, ) ) + self.data = cursor.fetchone() def add_fam( self, articlesList, heading, red_page_id, @@ -298,8 +324,9 @@ article7 ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );' cursor = type( self ).connection.cursor( mysqldb.DictCursor ) - cursor.execute( 'SELECT * FROM `red_families` WHERE `status` = ?;', - ( status, ) ) + cursor.execute( + 'SELECT * FROM `{prefix}_red_families` WHERE `status` = ?;'.format( + prefix=type( self ).db_table_prefix), ( status, ) ) while True: res = cursor.fetchmany( 1000 ) @@ -307,3 +334,17 @@ article7 ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );' break for row in res: yield row + + +class MysqlRedError(Exception): + """ + Basic Exception class for this module + """ + pass + + +class MysqlRedConnectionError(MysqlRedError): + """ + Raised if there are Errors with Mysql-Connections + """ + pass diff --git a/redfam.py b/lib/redfam.py similarity index 88% rename from redfam.py rename to lib/redfam.py index 3dec12f..30dd22d 100644 --- a/redfam.py +++ b/lib/redfam.py @@ -35,7 +35,7 @@ import pywikibot # noqa from pywikibot.tools import deprecated # noqa import jogobot -from mysqlred import MysqlRedFam +from lib.mysqlred import MysqlRedFam class RedFam: @@ -137,14 +137,14 @@ class RedFamParser( RedFam ): wurde gewünscht von:" __done_notice2 = "{{Erledigt|" - def __init__( self, heading, red_page_id, red_page_archive, + def __init__( self, heading, red_page, red_page_archive, beginning, ending=None ): """ Creates a RedFam object based on data collected while parsing red_pages combined with possibly former known data from db @param red_fam_heading str Wikitext heading of section - @param red_page_id int MediaWiki page_id + @param red_page 
page Pywikibot.page object @param red_page_archive bool Is red_page an archive @param beginning datetime Timestamp of beginning str as strptime parseable string @@ -153,7 +153,7 @@ class RedFamParser( RedFam ): """ # Set object attributes: - self._red_page_id = red_page_id + self._red_page_id = red_page._pageid self._red_page_archive = red_page_archive self._fam_hash = None @@ -210,13 +210,14 @@ class RedFamParser( RedFam ): @type heading wikicode or mwparser-parseable """ - # Parse heading with mwparse if needed - if not isinstance( heading, mwparser.wikicode.Wikicode ): - heading = mwparser.parse( heading ) - # Save heading as string self._heading = str( heading ) + # Parse string heading with mwparse again everytime + # In some cases the given wikicode is broken due to syntax errors + # (Task FS#77) + heading = mwparser.parse( self._heading ) + # Save destinations of wikilinks in headings self._articlesList = [ str( link.title ) for link in heading.ifilter_wikilinks() ] @@ -325,23 +326,23 @@ class RedFamParser( RedFam ): self._status ) @classmethod - @deprecated - def is_sectionheading( cls, line ): + def is_section_redfam_cb( cls, heading ): """ - Checks wether given line is a red_fam section heading - - @param str line String to check - - @returns bool Returns True if it is a section heading + Used as callback for wikicode.get_sections in redpage.parse to + select sections which are redfams """ + # Because of strange behavior in some cases, parse heading again + # (Task FS#77) + heading = mwparser.parse( str( heading ) ) - if cls.__sectionhead_pat.search( str(line) ): + # Make sure we have min. 
two wikilinks in heading to assume a redfam + if len( heading.filter_wikilinks() ) >= 2: return True else: return False @classmethod - def parser( cls, text, pageid, isarchive=False ): + def parser( cls, text, page, isarchive=False ): """ Handles parsing of redfam section @@ -359,8 +360,21 @@ class RedFamParser( RedFam ): # Extract beginnig and maybe ending (beginning, ending) = RedFamParser.extract_dates( text, isarchive ) + # Missing beginning (Task: FS#76) + # Use first day of month of reddisc + if not beginning: + match = re.search( + jogobot.config["redundances"]["reddiscs_onlyinclude_re"], + page.title() ) + + if match: + beginning = datetime.strptime( + "01. {month} {year}".format( + month=match.group(1), year=match.group(2)), + "%d. %B %Y" ) + # Create the RedFam object - RedFamParser( heading, pageid, isarchive, beginning, ending ) + RedFamParser( heading, page, isarchive, beginning, ending ) @classmethod def extract_dates( cls, text, isarchive=False ): @@ -401,51 +415,13 @@ class RedFamParser( RedFam ): else: ending = None + # Missing dates (Task: FS#76) + else: + beginning = None + ending = None return (beginning, ending) - @classmethod - @deprecated( 'extract_dates' ) - def is_beginning( cls, line ): - """ - Returns the first timestamp found in line, otherwise None - - @param str line String to search in - - @returns str Timestamp, otherwise None - """ - - return cls.extract_dates( line )[0] - - @classmethod - @deprecated( 'extract_dates' ) - def is_ending( cls, line, isarchive=False ): - """ - Returns the timestamp of done notice ( if one ), otherwise None - - @param line String to search in - @type line str - @param isarchive If true skip searching done notice (on archivepages) - @type isarchive bool - - @returns Timestamp, otherwise None - @returntype str - """ - - return cls.extract_dates( line )[1] - - @classmethod - @deprecated( 'extract_dates' ) - def is_ending2( cls, line ): - """ - Returns the last timestamp found in line, otherwise None - 
@param str line String to search in - - @returns str Timestamp, otherwise None - """ - - return cls.extract_dates( line, True )[1] - class RedFamWorker( RedFam ): """ diff --git a/redpage.py b/lib/redpage.py similarity index 96% rename from redpage.py rename to lib/redpage.py index 2b93ae8..176f6bc 100644 --- a/redpage.py +++ b/lib/redpage.py @@ -28,9 +28,10 @@ Provides a class for handling redundance discussion pages and archives import pywikibot # noqa import mwparserfromhell as mwparser -import jogobot +import jogobot # noqa -from mysqlred import MysqlRedPage +from lib.mysqlred import MysqlRedPage +from lib.redfam import RedFamParser class RedPage: @@ -116,7 +117,7 @@ class RedPage: # include_lead = if true include first section (intro) # include_heading = if true include heading fams = self.wikicode.get_sections( - matches=jogobot.config["redundances"]["section_heading_regex"], + matches=RedFamParser.is_section_redfam_cb, include_lead=False, include_headings=True ) # Iterate over RedFam diff --git a/parse-pages.py b/parse-pages.py deleted file mode 100644 index 4545aef..0000000 --- a/parse-pages.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# parse-pages.py -# -# Copyright 2016 GOLDERWEB – Jonathan Golder -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, -# MA 02110-1301, USA. 
-# -# -""" -Script to parse all redpages in configured categories -""" - -import pywikibot -from pywikibot import pagegenerators - -import jogobot - -import redpage -import redfam - - -def get_cat_pages( cat ): - """ - Generates a iteratable generator-object with all pages listet in given - category - - @param cat Category to request - @type cat str - - @returns generator Iteratable object with pages of given category - """ - - # Get site to work on from pywikibot config - site = pywikibot.Site() - - # Retrieve the content of given category - category = pywikibot.Category( site, cat ) - - # Build an iteratable generator object with page objects for given category - generator = pagegenerators.CategorizedPageGenerator( category ) - - return generator - - -def main(*args): - """ - Handles process - """ - - try: - jogobot.output( "BEGINN – parser-pages.py" ) - - # Iterate over configured categories - for cat in ( jogobot.config["redundances"]["redpage_cats"] ): - - # Iterate over pages in current cat - for page in get_cat_pages( cat ): - - # For pages configured to exclude, go on with next page - if page.title() in ( - jogobot.config["redundances"]["redpage_exclude"] ): - - continue - - # Initiate RedPage object - red_page = redpage.RedPage( page ) - - # Check whether parsing is needed - if red_page.is_parsing_needed(): - - # Iterate over returned generator with redfam sections - for fam in red_page.parse(): - - # Run RedFamParser on section text - redfam.RedFamParser.parser( fam, red_page.page._pageid, - red_page.is_archive() ) - else: - # If successfully parsed whole page, flush - # db write cache - redfam.RedFamParser.flush_db_cache() - jogobot.output( "Page '%s' parsed" % - red_page.page.title() ) - else: - # If successfully parsed all pages in cat, flush db write cache - redpage.RedPage.flush_db_cache() - - finally: - jogobot.output( "END – parser-pages.py" ) - pywikibot.stopme() - -if( __name__ == "__main__" ): - main() diff --git a/red.py b/red.py new file mode 
100644 index 0000000..733def2 --- /dev/null +++ b/red.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# red.py +# +# Copyright 2016 GOLDERWEB – Jonathan Golder +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. +# +# +""" +Wrapper script to invoke all redundances bot tasks +""" + +import os + +import pywikibot + +import jogobot + + +def prepare_bot( task_slug, subtask, genFactory, subtask_args ): + """ + Handles importing subtask Bot class and prepares specific args + + Raises an exception if the bot does not exist + + @param task_slug Task slug, needed for logging + @type task_slug str + @param subtask Slug of given subtask + @type subtask str + @param genFactory GenFactory with parsed pagegenerator args + @type genFactory pagegenerators.GeneratorFactory + @param subtask_args Additional args for subtasks + @type subtask_args dict + + @returns The following tuple + @return 1 Subtask slug (replaced None for default) + @rtype str + @return 2 Botclass of given subtask (Arg "-task") + @rtype Class + @return 3 GenFactory with parsed pagegenerator args + @rtype pagegenerators.GeneratorFactory + @return 4 Additional args for subtasks + @rtype dict + @rtype tuple + """ + # kwargs are passed to selected bot as **kwargs + kwargs = dict() + + if not subtask or subtask == "discparser": + # 
Default case: discparser + subtask = "discparser" + + # Import related bot + from bots.reddiscparser import DiscussionParserBot as Bot + + # Subtask error + else: + jogobot.output( ( + "\03{{red}} Given subtask \"{subtask} \"" + + "is not existing!" ).format( subtask=subtask ), "ERROR" ) + raise Exception + + return ( subtask, Bot, genFactory, kwargs ) + + +def main(*args): + """ + Process command line arguments and invoke bot. + + If args is an empty list, sys.argv is used. + + @param args: command line arguments + @type args: list of unicode + """ + + # Process global arguments to determine desired site + local_args = pywikibot.handle_args(args) + + # Get the jogobot-task_slug (basename of current file without ending) + task_slug = os.path.basename(__file__)[:-len(".py")] + + # Disabled until [FS#86] is done + # Before run, we need to check whether we are currently active or not + # if not jogobot.bot.active( task_slug ): + # return + + # Parse local Args to get information about subtask + ( subtask, genFactory, subtask_args ) = jogobot.bot.parse_local_args( + local_args ) + + # select subtask and prepare args + ( subtask, Bot, genFactory, kwargs ) = prepare_bot( + task_slug, subtask, genFactory, subtask_args ) + + # Init Bot + bot = jogobot.bot.init_bot( task_slug, subtask, Bot, genFactory, **kwargs) + + # Run bot + jogobot.bot.run_bot( task_slug, subtask, bot ) + + +if( __name__ == "__main__" ): + main()