#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
#  reddiscparser.py
#
#  Copyright 2016 GOLDERWEB – Jonathan Golder
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#
"""
Script to parse all reddisc pages in configured categories
"""

import os
import sys
import re

import pywikibot
from pywikibot import pagegenerators
from pywikibot.bot import ExistingPageBot, NoRedirectPageBot

import jogobot

from lib import redpage
from lib import redfam


class DiscussionParserBot(
        # CurrentPageBot,  # via next two sets 'current_page' on each treat()
        ExistingPageBot,  # CurrentPageBot only treats existing pages
        NoRedirectPageBot):  # class which only treats non-redirects
    """
    Bot class which initialises the parsing process of Redundancy Discussions
    """

    # RegEx to filter wrong pages; compiled once at class creation from the
    # jogobot configuration
    onlyinclude_re = re.compile(
        jogobot.config["redundances"]["reddiscs_onlyinclude_re"])

    def __init__(self, generator):
        """
        Constructor

        Parameters:
            @param generator: The page generator that determines on which
                              pages to work.
            @type generator: generator.
        """
        super().__init__(generator=generator)

    def run(self):
        """
        Controls the overall parsing process, using the super class for
        switching pages

        Needed to do things before/after treating pages is done.
        Any exception raised by the super class run() propagates unchanged;
        in that case the db write cache is not flushed.
        """
        super().run()

        # Only reached if all pages were treated without an exception:
        # flush db write cache
        redpage.RedPage.flush_db_cache()

    def treat_page(self):
        """
        Handles work on current page

        Skips pages which are explicitly excluded by configuration or whose
        title does not match the onlyinclude pattern. For the remaining
        pages every redfam section yielded by the RedPage parser is handed
        to RedFamParser, afterwards the result is reported via jogobot.
        """
        page_title = self.current_page.title()

        # Short circuit excluded pages
        if page_title in jogobot.config["redundances"]["redpage_exclude"]:
            return

        # Exclude pages which do not match pattern
        if not type(self).onlyinclude_re.search(page_title):
            return

        # Initiate RedPage object
        red_page = redpage.RedPage(self.current_page)

        # Nothing to do if parsing is not needed
        if not red_page.is_parsing_needed():
            return

        # Count families for failure analysis
        fam_counter = 0

        # Iterate over returned generator with redfam sections
        for fam in red_page.parse():

            # Run RedFamParser on section text
            redfam.RedFamParser.parser(
                fam, red_page.page, red_page.is_archive())

            fam_counter += 1

        # The loop has no break, so reaching this point means the whole page
        # was parsed without an exception
        reddisc_title = red_page.page.title()

        if fam_counter:
            # At least one redfam was parsed, flush db write cache
            redfam.RedFamParser.flush_db_cache()
            jogobot.output(
                "Page [[{reddisc}]] parsed".format(reddisc=reddisc_title))
        else:
            # Only the middle literal is formatted: "\03{red}" is a colour
            # marker and must not be passed through str.format()
            jogobot.output(
                "\03{red}"
                + "Page [[{reddisc}]], ".format(reddisc=reddisc_title)
                + "containing no redfam, parsed!",
                "WARNING")


def apply_conf_cat_generators(genFactory):
    """
    Builds generators for categories which are read from jogobot.config

    Parameters:
        @param genFactory: The GeneratorFactory to which the built
                           generators should be added.
        @type genFactory: pagegenerators.GeneratorFactory
    """
    # Create generators for configured categories
    for category in jogobot.config["redundances"]["redpage_cats"]:
        cgen = genFactory.getCategoryGen(
            category, gen_func=pagegenerators.CategorizedPageGenerator)

        # If there is one, append to genFactory
        if cgen:
            genFactory.gens.append(cgen)


def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: list of unicode
    """
    # Process global arguments to determine desired site
    local_args = pywikibot.handle_args(args)

    # Get the jogobot-task_slug (basename of current file without ending)
    task_slug = os.path.basename(__file__)[:-len(".py")]

    # Before run, we need to check whether we are currently active or not
    try:
        # Will throw Exception if disabled/blocked
        # NOTE: the check is currently disabled, so the handlers below are
        # not reachable until the following call is re-enabled.
        # jogobot.is_active( task_slug )
        pass

    except jogobot.jogobot.Blocked as err:
        jogobot.output(
            "\03{lightpurple} %s (%s)" % (err, type(err)),
            "CRITICAL")

    except jogobot.jogobot.Disabled as err:
        jogobot.output(
            "\03{red} %s (%s)" % (err, type(err)),
            "ERROR")

    # Bot/Task is active
    else:
        # This factory is responsible for processing command line arguments
        # that are also used by other scripts and that determine on which
        # pages to work on.
        genFactory = pagegenerators.GeneratorFactory()

        # Parse command line arguments; "-always" is accepted but has no
        # effect here, everything else is handed to the generator factory
        for arg in local_args:
            if not arg.startswith("-always"):
                genFactory.handleArg(arg)

        # Check whether there are generators waiting for factoring, if not
        # use configured categories
        if not genFactory.gens:
            apply_conf_cat_generators(genFactory)

        # Create combined generator (union of all generators); this is the
        # generator giving the pages that should be worked upon
        gen = genFactory.getCombinedGenerator()

        if gen:
            # Log beginning of parsing
            jogobot.output("{task_slug} invoked".format(task_slug=task_slug))

            # The preloading generator is responsible for downloading
            # multiple pages from the wiki simultaneously.
            gen = pagegenerators.PreloadingGenerator(gen)
            DiscussionParserBot(gen).run()
        else:
            pywikibot.showHelp()


if __name__ == "__main__":
    main()