jogobot-red/bots/reddiscparser.py

#!/usr/bin/env python3
# -*- coding: utf-8  -*-
#
#  reddiscparser.py
#
#  Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#
"""
Bot to parse all redundancy discussion (reddisc) pages yielded by the given
page generators or, if none are given, by the configured categories
"""

import re

import pywikibot  # noqa
from pywikibot import pagegenerators  # noqa
from pywikibot.bot import ExistingPageBot, NoRedirectPageBot

import jogobot

from lib.redpage import RedPage
from lib.redfam import RedFamParser


class DiscussionParserBot(
        # CurrentPageBot,  # via next two sets 'current_page' on each treat()
        ExistingPageBot,  # CurrentPageBot only treats existing pages
        NoRedirectPageBot):  # class which only treats non-redirects
    """
    Bot class which initialises the parsing process of Redundancy Discussions

    Pages are taken from the generators already collected in the given
    GeneratorFactory or, if there are none, from the categories configured
    in jogobot.config["redundances"]["redpage_cats"].
    """

    # RegEx to filter wrong pages (page titles not matching are skipped)
    onlyinclude_re = re.compile(
        jogobot.config["redundances"]["reddiscs_onlyinclude_re"])

    def __init__(self, genFactory, **kwargs):
        """
        Constructor

        Parameters:
            @param  genFactory  GenFactory with parsed pagegenerator args to
                                build generator
            @type  genFactory  pagegenerators.GeneratorFactory
            @param  **kwargs  Additional args (accepted for compatibility,
                              currently not forwarded to the super class)
            @type  iterable
        """

        # Copy needed args
        self.genFactory = genFactory

        # Build generator with genFactory (sets self.gen, maybe to None)
        self.build_generator()

        # Run super class init with built generator
        super(DiscussionParserBot, self).__init__(generator=self.gen)

    def build_generator(self):
        """
        Builds generator to work on, based on self.genFactory

        Sets self.gen to a preloading generator over the union of all
        generators of the factory. If no generator could be built, the help
        is shown and self.gen is set to None, so that the attribute always
        exists afterwards.
        """
        # Check whether there are generators waiting for factoring, if not
        # use configured categories
        if not self.genFactory.gens:
            self.apply_conf_cat_generators()

        # Create combined Generator (Union of all Generators)
        gen = self.genFactory.getCombinedGenerator()

        if gen:
            # The preloading generator is responsible for downloading multiple
            # pages from the wiki simultaneously.
            self.gen = pagegenerators.PreloadingGenerator(gen)

        else:
            # Nothing to work on: keep the attribute defined (run() checks
            # for None) and tell the user how to provide pages
            self.gen = None
            pywikibot.showHelp()

    def apply_conf_cat_generators(self):
        """
        Builds generators for categories which are read from jogobot.config

        The built generators are appended to self.genFactory.gens.
        """
        # Create Generators for configured Categories
        for category in jogobot.config["redundances"]["redpage_cats"]:
            gen = self.genFactory.getCategoryGen(
                category, gen_func=pagegenerators.CategorizedPageGenerator)

            # If there is one, append to genFactory
            if gen:
                self.genFactory.gens.append(gen)

    def run(self):
        """
        Controls the overall parsing process, using super class for page switch

        Needed to do things before/after treating pages is done. If no
        generator could be built there is nothing to do and the method
        returns immediately. Exceptions raised by the super class run are
        propagated unchanged; in that case the db write cache is not flushed.
        """
        # No pages to work on (help was already shown by build_generator)
        if self.gen is None:
            return

        super(DiscussionParserBot, self).run()

        # If successfully parsed all pages in cat, flush db write cache
        RedPage.flush_db_cache()

    def treat_page(self):
        """
        Handles work on current page

        Skips excluded pages and pages whose title does not match
        onlyinclude_re. For all other pages the matching RedPage is fetched
        from the db session by page id (and updated) or newly created. If
        the RedPage reports that parsing is needed, each section returned by
        its parse() is handed to RedFamParser.
        """
        title = self.current_page.title()

        # Short circuit excluded pages
        if title in jogobot.config["redundances"]["redpage_exclude"]:
            return

        # Exclude pages which do not match pattern
        if not type(self).onlyinclude_re.search(title):
            return

        # Initiate RedPage object (reuse the stored one if there is any)
        redpage = RedPage.session.query(RedPage).filter(
            RedPage.pageid == self.current_page.pageid).one_or_none()

        if redpage:
            redpage.update(self.current_page)
        else:
            redpage = RedPage(self.current_page)

        # Check whether parsing is needed
        if not redpage.is_parsing_needed():
            return

        # Count families for failure analysis
        fam_counter = 0

        # Iterate over returned generator with redfam sections
        for fam in redpage.parse():
            # Run RedFamParser on section text
            RedFamParser.parser(fam, redpage, redpage.is_archive())

            fam_counter += 1

        # Whole page parsed without exception
        if fam_counter:
            # Flush db write cache and report success
            RedFamParser.flush_db_cache()
            jogobot.output("Page [[{reddisc}]] parsed".format(
                reddisc=redpage.page.title()))
        else:
            # A reddisc page without any redfam is suspicious, warn about it
            jogobot.output(
                "\03{red}" + "Page [[{reddisc}]], ".format(
                    reddisc=redpage.page.title()) +
                "containing no redfam, parsed!",
                "WARNING")