jogobot-red/bots/reddiscparser.py

#!/usr/bin/env python3
# -*- coding: utf-8  -*-
#
#  reddiscparser.py
#
#  Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#
"""
Bot to parse all reddisc pages in given Generator or configured categories
"""

import re

import pywikibot  # noqa
from pywikibot import pagegenerators  # noqa
from pywikibot.bot import ExistingPageBot, NoRedirectPageBot

import jogobot

from lib import redpage
from lib import redfam


class DiscussionParserBot(
        # CurrentPageBot,  # via next two sets 'current_page' on each treat()
        ExistingPageBot,  # CurrentPageBot only treats existing pages
        NoRedirectPageBot ):  # class which only treats non-redirects
    """
    Bot class which initialises the parsing process of Redundancy Discussions

    Each page delivered by the generator is filtered by the configured
    exclude list and title pattern, wrapped in a redpage.RedPage and, if
    parsing is needed, each of its sections is passed on to
    redfam.RedFamParser.
    """

    # RegEx to filter wrong pages (pattern is read from jogobot.config once,
    # when the class is defined)
    onlyinclude_re = re.compile(
        jogobot.config["redundances"]["reddiscs_onlyinclude_re"] )

    def __init__( self, generator ):
        """
        Constructor

        Parameters:
            @param generator: The page generator that determines on which pages
                              to work.
            @type generator: generator.
        """
        # Hand the generator over to the pywikibot bot machinery; without
        # this call neither the generator nor the base class state which
        # run() relies on would be set up
        super( DiscussionParserBot, self ).__init__( generator=generator )

    def build_generator(self):
        """
        Builds generator to work on, based on self.genFactory

        The combined, preloading generator is stored in self.gen. If no
        generator could be built, the pywikibot help is shown instead.

        NOTE(review): self.genFactory is not assigned anywhere in this class;
        presumably the caller sets it before calling this method - confirm.
        """
        # Check whether there are generators waiting for factoring, if not
        # use configured categories
        if not self.genFactory.gens:
            self.apply_conf_cat_generators()

        # Create combined Generator (Union of all Generators)
        gen = self.genFactory.getCombinedGenerator()

        if gen:
            # The preloading generator is responsible for downloading multiple
            # pages from the wiki simultaneously.
            self.gen = pagegenerators.PreloadingGenerator(gen)

        else:
            pywikibot.showHelp()

    def apply_conf_cat_generators( self ):
        """
        Builds generators for categories which are read from jogobot.config

        For every category listed in
        jogobot.config["redundances"]["redpage_cats"] a category generator is
        requested from self.genFactory and, if one was returned, appended to
        self.genFactory.gens.
        """
        # Create Generators for configured Categories
        for category in jogobot.config["redundances"]["redpage_cats"]:
            gen = self.genFactory.getCategoryGen(
                category, gen_func=pagegenerators.CategorizedPageGenerator)

            # If there is one, append to genFactory
            if gen:
                self.genFactory.gens.append(gen)

    def run( self ):
        """
        Controls the overall parsing process, using super class for page switch

        Needed to do things before/after treating pages is done
        """
        # Any exception raised while treating pages propagates unchanged, so
        # the flush below is only reached after a successful run
        super( DiscussionParserBot, self ).run()

        # If successfully parsed all pages in cat, flush db write cache
        redpage.RedPage.flush_db_cache()

    def treat_page( self ):
        """
        Handles work on current page

        Pages listed in the configured exclude list, pages whose title does
        not match onlyinclude_re and pages which do not need parsing are
        skipped silently.
        """
        title = self.current_page.title()

        # Short circuit excluded pages
        if title in jogobot.config["redundances"]["redpage_exclude"]:
            return

        # Exclude pages which do not match pattern
        if not type(self).onlyinclude_re.search( title ):
            return

        # Initiate RedPage object
        red_page = redpage.RedPage( self.current_page )

        # Nothing to do if parsing is not needed
        if not red_page.is_parsing_needed():
            return

        # Count families for failure analysis
        fam_counter = 0

        # Iterate over returned generator with redfam sections
        for fam in red_page.parse():

            # Run RedFamParser on section text
            redfam.RedFamParser.parser( fam, red_page.page,
                                        red_page.is_archive() )

            fam_counter += 1

        # Whole page parsed without an exception: only flush the db write
        # cache if at least one redfam was handed to the parser
        if fam_counter:
            redfam.RedFamParser.flush_db_cache()
            jogobot.output( "Page [[{reddisc}]] parsed".format(
                reddisc=red_page.page.title() ) )
        else:
            # A page without any redfam is unexpected, emit a warning
            jogobot.output(
                "\03{red}" + "Page [[{reddisc}]], ".format(
                    reddisc=red_page.page.title() ) +
                "containing no redfam, parsed!",
                "WARNING" )
Add parse-pages.py Script 8 years ago			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`
			`#`
Correct filename in header Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`# reddiscparser.py`
Add parse-pages.py Script 8 years ago			`#`
			`# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>`
			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation; either version 2 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program; if not, write to the Free Software`
			`# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,`
			`# MA 02110-1301, USA.`
			`#`
			`#`
			`"""`
Reflect stucture changes in Code Since bot class is moved to separate dir/file we need to do some changes to rebuild functionality Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=82 FS#82] 8 years ago			`Bot to parse all reddisc pages in given Generator or configured categories`
Add parse-pages.py Script 8 years ago			`"""`

Check reddisc page titles against regex To prevent parsing Pages which have been categorized in configured cats wrong or are given via cmd params Parsing them results in unexpected behaviour Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] 8 years ago			`import re`
Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago
Reflect stucture changes in Code Since bot class is moved to separate dir/file we need to do some changes to rebuild functionality Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=82 FS#82] 8 years ago			`import pywikibot # noqa`
			`from pywikibot import pagegenerators # noqa`
Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`from pywikibot.bot import ExistingPageBot, NoRedirectPageBot`
Add parse-pages.py Script 8 years ago
			`import jogobot`

Introduce new directory structure To clarify which is a bot and which are helper scripts Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=74 FS#74] 8 years ago			`from lib import redpage`
			`from lib import redfam`
Add parse-pages.py Script 8 years ago

Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`class DiscussionParserBot(`
			`# CurrentPageBot, # via next two sets 'current_page' on each treat()`
			`ExistingPageBot, # CurrentPageBot only treats existing pages`
			`NoRedirectPageBot ): # class which only treats non-redirects`
			`"""`
			`Botclass witch initialises the parsing process of Redundancy Discussions`
Add parse-pages.py Script 8 years ago			`"""`

Check reddisc page titles against regex To prevent parsing Pages which have been categorized in configured cats wrong or are given via cmd params Parsing them results in unexpected behaviour Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] 8 years ago			`# RegEx to filter wrong pages`
			`onlyinclude_re = re.compile(`
			`jogobot.config["redundances"]["reddiscs_onlyinclude_re"] )`

Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`def __init__( self, generator ):`
			`"""`
			`Constructor`
Add parse-pages.py Script 8 years ago
Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`Parameters:`
			`@param generator: The page generator that determines on which pages`
			`to work.`
			`@type generator: generator.`
			`"""`
Add methods to build gen to DiscussionParser With the new wrapper script the Bot gets a GenFactory and has to build a generator out of it by its own Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=83 FS#83] 8 years ago
			`def build_generator(self):`
			`"""`
			`Builds generator to work on, based on self.genFactory`
			`"""`
			`# Check wether there are generators waiting for factoring, if not`
			`# use configured categories`
			`if not self.genFactory.gens:`
			`self.apply_conf_cat_generators()`

			`# Create combined Generator (Union of all Generators)`
			`gen = self.genFactory.getCombinedGenerator()`

			`if gen:`
			`# The preloading generator is responsible for downloading multiple`
			`# pages from the wiki simultaneously.`
			`self.gen = pagegenerators.PreloadingGenerator(gen)`

			`else:`
			`pywikibot.showHelp()`

			`def apply_conf_cat_generators( self ):`
			`"""`
			`Builds generators for categories which are read from jogobot.config`

			`Parameters:`
			`@param genFactory: The GeneratorFactory to which the builded`
			`generators should be added.`
			`@type genFactory: pagegenerators.GeneratorFactory`
			`"""`
			`# Create Generators for configured Categories`
			`for category in jogobot.config["redundances"]["redpage_cats"]:`
			`gen = self.genFactory.getCategoryGen(`
			`category, gen_func=pagegenerators.CategorizedPageGenerator)`

			`# If there is one, append to genFactory`
			`if gen:`
			`self.genFactory.gens.append(gen)`

			`# Reset gen for next iteration`
			`gen = None`
Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago
			`def run( self ):`
			`"""`
			`Controls the overal parsing process, using super class for page switch`

			`Needed to do things before/after treating pages is done`
			`"""`
			`try:`

			`super( DiscussionParserBot, self ).run()`

			`except:`
			`raise`

			`else:`

			`# If successfully parsed all pages in cat, flush db write cache`
			`redpage.RedPage.flush_db_cache()`

			`def treat_page( self ):`
			`"""`
			`Handles work on current page`
			`"""`
Add parse-pages.py Script 8 years ago
Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`# Short circuit excluded pages`
			`if self.current_page.title() in (`
			`jogobot.config["redundances"]["redpage_exclude"] ):`
Add parse-pages.py Script 8 years ago
Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`return`
Add parse-pages.py Script 8 years ago
Check reddisc page titles against regex To prevent parsing Pages which have been categorized in configured cats wrong or are given via cmd params Parsing them results in unexpected behaviour Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] 8 years ago			`# Exclude pages which does not match pattern`
			`if not type(self).onlyinclude_re.search( self.current_page.title() ):`

			`return`

Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`# Initiate RedPage object`
			`red_page = redpage.RedPage( self.current_page )`
Add parse-pages.py Script 8 years ago
Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`# Check whether parsing is needed`
			`if red_page.is_parsing_needed():`
Add parse-pages.py Script 8 years ago
Make sure only flush db if there are redfams To prevent from doing unnecessary stuff and trying to use not existing db connection Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] 8 years ago			`# Count families for failure analysis`
			`fam_counter = 0`

Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`# Iterate over returned generator with redfam sections`
			`for fam in red_page.parse():`
Add parse-pages.py Script 8 years ago
Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`# Run RedFamParser on section text`
Pass reddisc pywikibot.page object to redfam To access page information like page title (eg. to get dates from it) of the reddisc page Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=76 FS#76] 8 years ago			`redfam.RedFamParser.parser( fam, red_page.page,`
Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`red_page.is_archive() )`
Make sure only flush db if there are redfams To prevent from doing unnecessary stuff and trying to use not existing db connection Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] 8 years ago
			`fam_counter += 1`

Rewrite parse control using pywikibot.bot classes To use the default pywikibot.classes making life easier at some point Beeing standardconform with pywikibot in handling args Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72] 8 years ago			`else:`
			`# If successfully parsed whole page, flush`
			`# db write cache`
Make sure only flush db if there are redfams To prevent from doing unnecessary stuff and trying to use not existing db connection Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] 8 years ago			`if( fam_counter ):`
			`redfam.RedFamParser.flush_db_cache()`
			`jogobot.output( "Page [[{reddisc}]] parsed".format(`
			`reddisc=red_page.page.title() ) )`
			`else:`
			`jogobot.output(`
Prevent flush from creating cursor without con MysqlRed.flush() tried to create a cursor in any case. If there was no connection (because the subclasses haven't been instantiated an oursql Error occured. Instead, check before if there is a connection and otherwise raise an Error Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] 8 years ago			`"\03{red}" + "Page [[{reddisc}]], ".format(`
Make sure only flush db if there are redfams To prevent from doing unnecessary stuff and trying to use not existing db connection Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] 8 years ago			`reddisc=red_page.page.title() ) +`
			`"containing no redfam, parsed!",`
			`"WARNING" )`