#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
#  reddiscparser.py
#
#  Copyright 2016 GOLDERWEB – Jonathan Golder
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#
"""
Bot to parse all reddisc pages in given Generator or configured categories
"""

import re

import pywikibot  # noqa
from pywikibot import pagegenerators  # noqa
from pywikibot.bot import ExistingPageBot, NoRedirectPageBot

import jogobot

from lib import redpage
from lib import redfam


class DiscussionParserBot(
        # CurrentPageBot,  # via next two sets 'current_page' on each treat()
        ExistingPageBot,  # CurrentPageBot only treats existing pages
        NoRedirectPageBot ):  # class which only treats non-redirects
    """
    Bot class which initialises the parsing process of Redundancy Discussions
    """

    # RegEx to filter wrong pages (pattern is read from the bot configuration
    # once, when the class body is executed)
    onlyinclude_re = re.compile(
        jogobot.config["redundances"]["reddiscs_onlyinclude_re"] )

    def __init__( self, generator ):
        """
        Constructor

        Parameters:
            @param generator: The page generator that determines on which pages
                              to work.
            @type generator: generator.
        """
        # Hand the generator over to the pywikibot base classes; without this
        # call the inherited run() has no generator to iterate over.
        super( DiscussionParserBot, self ).__init__( generator=generator )

    def build_generator( self ):
        """
        Builds generator to work on, based on self.genFactory

        Reads self.genFactory, which is not assigned anywhere in this class
        as visible here, so it has to be set on the instance before this
        method is called.

        On success the preloading generator is stored in self.gen. If the
        factory yields no generator, the pywikibot help is shown and self.gen
        is left untouched.
        """
        # Check whether there are generators waiting for factoring, if not
        # use configured categories
        if not self.genFactory.gens:
            self.apply_conf_cat_generators()

        # Create combined Generator (Union of all Generators)
        gen = self.genFactory.getCombinedGenerator()

        if gen:
            # The preloading generator is responsible for downloading multiple
            # pages from the wiki simultaneously.
            self.gen = pagegenerators.PreloadingGenerator( gen )
        else:
            pywikibot.showHelp()

    def apply_conf_cat_generators( self ):
        """
        Builds generators for categories which are read from jogobot.config

        The category names are taken from
        jogobot.config["redundances"]["redpage_cats"]; every generator that
        could be built is appended to self.genFactory.gens.
        """
        # Create Generators for configured Categories
        for category in jogobot.config["redundances"]["redpage_cats"]:
            gen = self.genFactory.getCategoryGen(
                category, gen_func=pagegenerators.CategorizedPageGenerator )

            # If there is one, append to genFactory
            if gen:
                self.genFactory.gens.append( gen )

    def run( self ):
        """
        Controls the overall parsing process, using super class for page
        switch

        Needed to do things before/after treating pages is done
        """
        # Any exception raised by the super class run() propagates unchanged,
        # in which case the flush below is skipped.
        super( DiscussionParserBot, self ).run()

        # If successfully parsed all pages in cat, flush db write cache
        redpage.RedPage.flush_db_cache()

    def treat_page( self ):
        """
        Handles work on current page

        Skips pages that are listed in the configured exclude list or whose
        title does not match onlyinclude_re. For the remaining pages every
        redfam section returned by RedPage.parse() is passed to
        RedFamParser.parser(), provided RedPage reports that parsing is
        needed.
        """
        title = self.current_page.title()

        # Short circuit excluded pages
        if title in jogobot.config["redundances"]["redpage_exclude"]:
            return

        # Exclude pages which do not match pattern
        if not type( self ).onlyinclude_re.search( title ):
            return

        # Initiate RedPage object
        red_page = redpage.RedPage( self.current_page )

        # Check whether parsing is needed
        if not red_page.is_parsing_needed():
            return

        # Count families for failure analysis
        fam_counter = 0

        # Iterate over returned generator with redfam sections
        for fam in red_page.parse():
            # Run RedFamParser on section text
            redfam.RedFamParser.parser(
                fam, red_page.page, red_page.is_archive() )

            fam_counter += 1

        # Reached only if the loop above finished without an exception,
        # i.e. the whole page was parsed successfully.
        if fam_counter:
            # Flush db write cache
            redfam.RedFamParser.flush_db_cache()
            jogobot.output( "Page [[{reddisc}]] parsed".format(
                reddisc=red_page.page.title() ) )
        else:
            # A reddisc page without any redfam is suspicious, so warn
            jogobot.output(
                "\03{red}" + "Page [[{reddisc}]], ".format(
                    reddisc=red_page.page.title() ) +
                "containing no redfam, parsed!",
                "WARNING" )