From a8605bcee64a12fc2787492d978622710c4132a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Tue, 23 Aug 2016 21:50:22 +0200 Subject: [PATCH 1/4] Mv parse-pages.py to reddiscparser.py New, more meaningful naming conventions, from redpage to reddisc (page) Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] --- parse-pages.py => reddiscparser.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename parse-pages.py => reddiscparser.py (100%) diff --git a/parse-pages.py b/reddiscparser.py similarity index 100% rename from parse-pages.py rename to reddiscparser.py From 6cb92c1da7cac0ecfa5875968c11aae9e8252aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Tue, 23 Aug 2016 21:53:44 +0200 Subject: [PATCH 2/4] Rewrite parse control using pywikibot.bot classes To use the default pywikibot.bot classes, making life easier at some point Being standard-conformant with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] --- reddiscparser.py | 190 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 140 insertions(+), 50 deletions(-) diff --git a/reddiscparser.py b/reddiscparser.py index 4545aef..2d7164f 100644 --- a/reddiscparser.py +++ b/reddiscparser.py @@ -22,11 +22,15 @@ # # """ -Script to parse all redpages in configured categories +Script to parse all reddisc pages in configured categories """ +import os +import sys + import pywikibot from pywikibot import pagegenerators +from pywikibot.bot import ExistingPageBot, NoRedirectPageBot import jogobot @@ -34,74 +38,160 @@ import redpage import redfam -def get_cat_pages( cat ): +class DiscussionParserBot( + # CurrentPageBot, # via next two sets 'current_page' on each treat() + ExistingPageBot, # CurrentPageBot only treats existing pages + NoRedirectPageBot ): # class which only treats non-redirects """ - Generates a iteratable generator-object with all 
pages listet in given - category - - @param cat Category to request - @type cat str - - @returns generator Iteratable object with pages of given category + Bot class which initialises the parsing process of Redundancy Discussions """ - # Get site to work on from pywikibot config - site = pywikibot.Site() + def __init__( self, generator ): + """ + Constructor - # Retrieve the content of given category - category = pywikibot.Category( site, cat ) + Parameters: + @param generator: The page generator that determines on which pages + to work. + @type generator: generator. + """ + super( DiscussionParserBot, self ).__init__(generator=generator) - # Build an iteratable generator object with page objects for given category - generator = pagegenerators.CategorizedPageGenerator( category ) + def run( self ): + """ + Controls the overall parsing process, using super class for page switch - return generator + Needed to do things before/after treating pages is done + """ + try: + super( DiscussionParserBot, self ).run() -def main(*args): - """ - Handles process - """ + except: + raise - try: - jogobot.output( "BEGINN – parser-pages.py" ) + else: - # Iterate over configured categories - for cat in ( jogobot.config["redundances"]["redpage_cats"] ): + # If successfully parsed all pages in cat, flush db write cache + redpage.RedPage.flush_db_cache() - # Iterate over pages in current cat - for page in get_cat_pages( cat ): + def treat_page( self ): + """ + Handles work on current page + """ - # For pages configured to exclude, go on with next page - if page.title() in ( - jogobot.config["redundances"]["redpage_exclude"] ): + # Short circuit excluded pages + if self.current_page.title() in ( + jogobot.config["redundances"]["redpage_exclude"] ): - continue + return - # Initiate RedPage object - red_page = redpage.RedPage( page ) + # Initiate RedPage object + red_page = redpage.RedPage( self.current_page ) - # Check whether parsing is needed - if red_page.is_parsing_needed(): + # Check 
whether parsing is needed + if red_page.is_parsing_needed(): - # Iterate over returned generator with redfam sections - for fam in red_page.parse(): + # Iterate over returned generator with redfam sections + for fam in red_page.parse(): - # Run RedFamParser on section text - redfam.RedFamParser.parser( fam, red_page.page._pageid, - red_page.is_archive() ) - else: - # If successfully parsed whole page, flush - # db write cache - redfam.RedFamParser.flush_db_cache() - jogobot.output( "Page '%s' parsed" % - red_page.page.title() ) + # Run RedFamParser on section text + redfam.RedFamParser.parser( fam, red_page.page._pageid, + red_page.is_archive() ) else: - # If successfully parsed all pages in cat, flush db write cache - redpage.RedPage.flush_db_cache() + # If successfully parsed whole page, flush + # db write cache + redfam.RedFamParser.flush_db_cache() + jogobot.output( "Page [[{reddisc}]] parsed".format( + reddisc=red_page.page.title() ) ) - finally: - jogobot.output( "END – parser-pages.py" ) - pywikibot.stopme() + +def main(*args): # noqa + """ + Process command line arguments and invoke bot. + + If args is an empty list, sys.argv is used. 
+ + @param args: command line arguments + @type args: list of unicode + """ + + # Process global arguments to determine desired site + local_args = pywikibot.handle_args(args) + + # Get the jogobot-task_slug (basename of current file without ending) + task_slug = os.path.basename(__file__)[:-len(".py")] + + # Before run, we need to check whether we are currently active or not + try: + # Will throw Exception if disabled/blocked + # jogobot.is_active( task_slug ) + pass + + except jogobot.jogobot.Blocked: + (type, value, traceback) = sys.exc_info() + jogobot.output( "\03{lightpurple} %s (%s)" % (value, type ), + "CRITICAL" ) + + except jogobot.jogobot.Disabled: + (type, value, traceback) = sys.exc_info() + jogobot.output( "\03{red} %s (%s)" % (value, type ), + "ERROR" ) + + # Bot/Task is active + else: + + # This factory is responsible for processing command line arguments + # that are also used by other scripts and that determine on which pages + # to work on. + genFactory = pagegenerators.GeneratorFactory() + # The generator gives the pages that should be worked upon. 
+ gen = None + + # If always is True, bot won't ask for confirmation of edit (automode) + # always = False + + # If force_reload is True, bot will always parse Countrylist regardless + # if parsing is needed or not + # force_reload = False + + # Parse command line arguments + for arg in local_args: + if arg.startswith("-always"): + # always = True + pass + else: + genFactory.handleArg(arg) + + if not gen: + + # Check wether there are generators waiting for factoring, if not + # use configured categories + if not genFactory.gens: + + # Create Generators for configured Categories + for category in jogobot.config["redundances"]["redpage_cats"]: + cgen = genFactory.getCategoryGen( + category, + gen_func=pagegenerators.CategorizedPageGenerator) + + # If there is one, append to genFactory + if cgen: + genFactory.gens.append(cgen) + + # Create combined Generator (Union of all Generators) + gen = genFactory.getCombinedGenerator() + + if gen: + # Log beginning of parsing + jogobot.output( "{task_slug} invoked".format(task_slug=task_slug) ) + + # The preloading generator is responsible for downloading multiple + # pages from the wiki simultaneously. 
+ gen = pagegenerators.PreloadingGenerator(gen) + DiscussionParserBot( gen ).run() + else: + pywikibot.showHelp() if( __name__ == "__main__" ): main() From 17bfb32dede157bf33272c1a025b357729850561 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Wed, 24 Aug 2016 11:13:12 +0200 Subject: [PATCH 3/4] Building generators of config cats in separate function Since the main() function was too complex, the logic to build generators out of categories provided in jogobot.conf was moved into a separate function [https://fs.golderweb.de/index.php?do=details&task_id=73 FS#73] Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] --- reddiscparser.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/reddiscparser.py b/reddiscparser.py index 2d7164f..cd9cf29 100644 --- a/reddiscparser.py +++ b/reddiscparser.py @@ -106,7 +106,26 @@ class DiscussionParserBot( reddisc=red_page.page.title() ) ) -def main(*args): # noqa +def apply_conf_cat_generators( genFactory ): + """ + Builds generators for categories which are read from jogobot.config + + Parameters: + @param genFactory: The GeneratorFactory to which the built generators + should be added. + @type genFactory: pagegenerators.GeneratorFactory + """ + # Create Generators for configured Categories + for category in jogobot.config["redundances"]["redpage_cats"]: + cgen = genFactory.getCategoryGen( + category, gen_func=pagegenerators.CategorizedPageGenerator) + + # If there is one, append to genFactory + if cgen: + genFactory.gens.append(cgen) + + +def main(*args): """ Process command line arguments and invoke bot. 
@@ -168,16 +187,7 @@ def main(*args): # noqa # Check wether there are generators waiting for factoring, if not # use configured categories if not genFactory.gens: - - # Create Generators for configured Categories - for category in jogobot.config["redundances"]["redpage_cats"]: - cgen = genFactory.getCategoryGen( - category, - gen_func=pagegenerators.CategorizedPageGenerator) - - # If there is one, append to genFactory - if cgen: - genFactory.gens.append(cgen) + apply_conf_cat_generators( genFactory ) # Create combined Generator (Union of all Generators) gen = genFactory.getCombinedGenerator() From 2f878ee901051c5d58eb22f640d882dae99eaa92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Wed, 24 Aug 2016 11:20:28 +0200 Subject: [PATCH 4/4] Correct filename in header Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] --- reddiscparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reddiscparser.py b/reddiscparser.py index cd9cf29..6525ac9 100644 --- a/reddiscparser.py +++ b/reddiscparser.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # -# parse-pages.py +# reddiscparser.py # # Copyright 2016 GOLDERWEB – Jonathan Golder #