Merge branch 'fs#70-refactoring' into test-v3

2016-08-27 19:40:00 +02:00
parent 5d31bdd7eb 449d83d7b5
commit eaa7596a8f
8 changed files with 391 additions and 182 deletions
--- a/bots/init.py
+++ b/bots/init.py
@@ -0,0 +1,2 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8  -*-
--- a/bots/reddiscparser.py
+++ b/bots/reddiscparser.py
@@ -0,0 +1,178 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8  -*-
 #
 #  reddiscparser.py
 #
 #  Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  You should have received a copy of the GNU General Public License
 #  along with this program; if not, write to the Free Software
 #  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 #  MA 02110-1301, USA.
 #
 #
 """
 Bot to parse all reddisc pages in given Generator or configured categories
 """
 import re
 import pywikibot  # noqa
 from pywikibot import pagegenerators  # noqa
 from pywikibot.bot import ExistingPageBot, NoRedirectPageBot
 import jogobot
 from lib import redpage
 from lib import redfam
 class DiscussionParserBot(
        # CurrentPageBot,  # via next two sets 'current_page' on each treat()
        ExistingPageBot,  # CurrentPageBot only treats existing pages
        NoRedirectPageBot ):  # class which only treats non-redirects
    """
    Bot class which initialises the parsing process of redundancy discussions

    Pages to work on are taken from the generators of the given genFactory.
    If it holds none, generators for the categories configured in
    jogobot.config["redundances"]["redpage_cats"] are used instead.
    """

    # RegEx to filter wrong pages (compiled once when the class is created)
    onlyinclude_re = re.compile(
        jogobot.config["redundances"]["reddiscs_onlyinclude_re"] )

    def __init__( self, genFactory, **kwargs ):
        """
        Constructor

        Parameters:
            @param  genFactory  GenFactory with parsed pagegenerator args to
                                build generator
            @type  genFactory  pagegenerators.GeneratorFactory
            @param  **kwargs  Additional args, accepted but not forwarded to
                              the super class
            @type  **kwargs  dict
        """

        # Copy needed args
        self.genFactory = genFactory

        # Build generator with genFactory
        self.build_generator()

        # Run super class init with the built generator
        super( DiscussionParserBot, self ).__init__(generator=self.gen)

    def build_generator(self):
        """
        Builds generator to work on, based on self.genFactory

        On success the result is stored in self.gen.
        """

        # Check whether there are generators waiting for factoring, if not
        # use configured categories
        if not self.genFactory.gens:
            self.apply_conf_cat_generators()

        # Create combined Generator (Union of all Generators)
        gen = self.genFactory.getCombinedGenerator()

        if gen:
            # The preloading generator is responsible for downloading multiple
            # pages from the wiki simultaneously.
            self.gen = pagegenerators.PreloadingGenerator(gen)
        else:
            # NOTE(review): self.gen stays unset in this branch, so __init__
            # fails with AttributeError right after the help was shown.
            pywikibot.showHelp()

    def apply_conf_cat_generators( self ):
        """
        Builds generators for categories which are read from jogobot.config

        The generators are appended to self.genFactory.gens.
        """

        # Create Generators for configured Categories
        for category in jogobot.config["redundances"]["redpage_cats"]:
            gen = self.genFactory.getCategoryGen(
                category, gen_func=pagegenerators.CategorizedPageGenerator)

            # If there is one, append to genFactory
            if gen:
                self.genFactory.gens.append(gen)

    def run( self ):
        """
        Controls the overall parsing process, using super class for page
        switch

        Needed to do things before/after treating pages is done
        """

        # Exceptions raised by the super class run propagate unchanged; the
        # flush below is then skipped
        super( DiscussionParserBot, self ).run()

        # If successfully parsed all pages, flush db write cache
        redpage.RedPage.flush_db_cache()

    def treat_page( self ):
        """
        Handles work on current page

        Skips excluded pages and pages whose title does not match
        onlyinclude_re, parses every redfam section of the remaining ones and
        flushes the redfam db write cache if at least one was found.
        """

        title = self.current_page.title()

        # Short circuit excluded pages
        if title in jogobot.config["redundances"]["redpage_exclude"]:
            return

        # Exclude pages which do not match the pattern
        if not type(self).onlyinclude_re.search( title ):
            return

        # Initiate RedPage object
        red_page = redpage.RedPage( self.current_page )

        # Check whether parsing is needed
        if not red_page.is_parsing_needed():
            return

        # Count families for failure analysis
        fam_counter = 0

        # Iterate over returned generator with redfam sections
        for fam in red_page.parse():
            # Run RedFamParser on section text
            redfam.RedFamParser.parser( fam, red_page.page,
                                        red_page.is_archive() )
            fam_counter += 1

        # Only reached if the whole page was parsed without an exception
        if fam_counter:
            redfam.RedFamParser.flush_db_cache()
            jogobot.output( "Page [[{reddisc}]] parsed".format(
                reddisc=red_page.page.title() ) )
        else:
            # A parsed page without any redfam is suspicious, warn about it
            jogobot.output(
                "\03{red}" + "Page [[{reddisc}]], ".format(
                    reddisc=red_page.page.title() ) +
                "containing no redfam, parsed!",
                "WARNING" )
--- a/2
+++ b/2
--- a/lib/mysqlred.py
+++ b/lib/mysqlred.py
@@ -33,6 +33,7 @@ except ImportError:
 import atexit
 import pywikibot
 from pywikibot import config
 import jogobot
@@ -53,6 +54,7 @@ class MysqlRed:
    db_username = config.db_username
    db_password = config.db_password
    db_name = config.db_username + jogobot.config['db_suffix']
    db_table_prefix = False
    # Class variables for storing cached queries
    _cached_update_data = []
@@ -67,6 +69,14 @@ class MysqlRed:
        @returns     mysql-stream    MySQL Connection
        """
        # Needs to be generated after Parsing of Args (not at import time)
        if not type(self).db_table_prefix:
            type(self).db_table_prefix = \
                pywikibot.Site().family.dbName(pywikibot.Site().code)
        # Now we can setup prepared queries
        self._prepare_queries()
        # Connect to mysqldb only once
        if not type( self ).connection:
@@ -87,11 +97,23 @@ class MysqlRed:
        type( self ).connection.close()
    def _prepare_queries( self ):
        """
        Used to replace placeholders in prepared queries
        """
        type(self)._update_query = type(self)._update_query.format(
            prefix=type(self).db_table_prefix)
        type(self)._insert_query = type(self)._insert_query.format(
            prefix=type(self).db_table_prefix)
    @classmethod
    def flush( cls ):
        """
        Run cached querys
        """
        if not cls.connection:
            raise MysqlRedConnectionError( "No connection exists!" )
        cursor = cls.connection.cursor()
        # Execute insert query
@@ -132,12 +154,13 @@ class MysqlRedPage( MysqlRed ):
    """
    # Class variables for storing cached queries
    # '{prefix}' will be replaced during super().__init__()
    _cached_update_data = []
-    _update_query = 'UPDATE `red_pages` \
+    _update_query = 'UPDATE `{prefix}_red_pages` \
 SET `page_title` = ?, `rev_id` = ?, `status`= ? WHERE `page_id` = ?;'
    _cached_insert_data = {}
-    _insert_query = 'INSERT INTO `red_pages` \
+    _insert_query = 'INSERT INTO `{prefix}_red_pages` \
 ( page_id, page_title, rev_id, status ) VALUES ( ?, ?, ?, ? );'
    def __init__( self, page_id ):
@@ -166,8 +189,10 @@ SET `page_title` = ?, `rev_id` = ?, `status`= ? WHERE `page_id` = ?;'
        cursor = type( self ).connection.cursor(mysqldb.DictCursor)
-        cursor.execute( 'SELECT * FROM `red_pages` WHERE `page_id` = ?;',
+        cursor.execute(
-                        ( self.__page_id, ) )
+            'SELECT * FROM `{prefix}_red_pages` WHERE `page_id` = ?;'.format(
                prefix=type(self).db_table_prefix), ( self.__page_id, ) )
        res = cursor.fetchone()
        if res:
@@ -218,12 +243,11 @@ class MysqlRedFam( MysqlRed ):
    # Class variables for storing cached queries
    _cached_update_data = []
-    _update_query = 'UPDATE `red_families` \
+    _update_query = 'UPDATE `{prefix}_red_families` \
 SET `red_page_id` = ?, `heading` = ?, `beginning` = ?, `ending` = ?, \
 `status`= ? WHERE `fam_hash` = ?;'
    _cached_insert_data = {}
-    _insert_query = 'INSERT INTO `red_families` \
+    _insert_query = 'INSERT INTO `{prefix}_red_families` \
 ( fam_hash, red_page_id, beginning, ending, status, heading, \
 article0, article1, article2, article3, article4, article5, article6, \
 article7 ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );'
@@ -249,8 +273,10 @@ article7 ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );'
        cursor = type( self ).connection.cursor( mysqldb.DictCursor )
-        cursor.execute( 'SELECT * FROM `red_families` WHERE `fam_hash` = ?;',
+        cursor.execute(
-                        ( fam_hash, ) )
+            'SELECT * FROM `{prefix}_red_families` WHERE `fam_hash` = ?;'.
            format( prefix=type(self).db_table_prefix), ( fam_hash, ) )
        self.data = cursor.fetchone()
    def add_fam( self, articlesList, heading, red_page_id,
@@ -298,8 +324,9 @@ article7 ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );'
        cursor = type( self ).connection.cursor( mysqldb.DictCursor )
-        cursor.execute( 'SELECT * FROM `red_families` WHERE `status` = ?;',
+        cursor.execute(
-                        ( status, ) )
+            'SELECT * FROM `{prefix}_red_families` WHERE `status` = ?;'.format(
                prefix=type( self ).db_table_prefix), ( status, ) )
        while True:
            res = cursor.fetchmany( 1000 )
@@ -307,3 +334,17 @@ article7 ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );'
                break
            for row in res:
                yield row
 class MysqlRedError(Exception):
    """
    Common base class of the exceptions defined in this module
    """
 class MysqlRedConnectionError(MysqlRedError):
    """
    Signals errors concerning MySQL connections
    """
--- a/lib/redfam.py
+++ b/lib/redfam.py
@@ -35,7 +35,7 @@ import pywikibot  # noqa
 from pywikibot.tools import deprecated  # noqa
 import jogobot
-from mysqlred import MysqlRedFam
+from lib.mysqlred import MysqlRedFam
 class RedFam:
@@ -137,14 +137,14 @@ class RedFamParser( RedFam ):
                     wurde gewünscht von:"
    __done_notice2 = "{{Erledigt|"
-    def __init__( self, heading, red_page_id, red_page_archive,
+    def __init__( self, heading, red_page, red_page_archive,
                  beginning, ending=None ):
        """
        Creates a RedFam object based on data collected while parsing red_pages
        combined with possibly former known data from db
        @param    red_fam_heading     str         Wikitext heading of section
-        @param    red_page_id         int         MediaWiki page_id
+        @param    red_page            page        Pywikibot.page object
        @param    red_page_archive    bool        Is red_page an archive
        @param    beginning           datetime    Timestamp of beginning
                                      str         as strptime parseable string
@@ -153,7 +153,7 @@ class RedFamParser( RedFam ):
        """
        # Set object attributes:
-        self._red_page_id = red_page_id
+        self._red_page_id = red_page._pageid
        self._red_page_archive = red_page_archive
        self._fam_hash = None
@@ -210,13 +210,14 @@ class RedFamParser( RedFam ):
        @type  heading  wikicode or mwparser-parseable
        """
        # Parse heading with mwparse if needed
        if not isinstance( heading, mwparser.wikicode.Wikicode ):
            heading = mwparser.parse( heading )
        # Save heading as string
        self._heading = str( heading )
        # Parse string heading with mwparse again everytime
        # In some cases the given wikicode is broken due to syntax errors
        # (Task FS#77)
        heading = mwparser.parse( self._heading )
        # Save destinations of wikilinks in headings
        self._articlesList = [ str( link.title ) for link
                               in heading.ifilter_wikilinks() ]
@@ -325,23 +326,23 @@ class RedFamParser( RedFam ):
                                         self._status )
    @classmethod
-    @deprecated
+    def is_section_redfam_cb( cls, heading ):
    def is_sectionheading( cls, line ):
        """
-        Checks wether given line is a red_fam section heading
+        Used as callback for wikicode.get_sections in redpage.parse to
-
+        select sections which are redfams
        @param      str     line    String to check
        @returns    bool            Returns True if it is a section heading
        """
        # Because of strange behavior in some cases, parse heading again
        # (Task FS#77)
        heading = mwparser.parse( str( heading ) )
-        if cls.__sectionhead_pat.search( str(line) ):
+        # Make sure we have min. two wikilinks in heading to assume a redfam
        if len( heading.filter_wikilinks() ) >= 2:
            return True
        else:
            return False
    @classmethod
-    def parser( cls, text, pageid, isarchive=False ):
+    def parser( cls, text, page, isarchive=False ):
        """
        Handles parsing of redfam section
@@ -359,8 +360,21 @@ class RedFamParser( RedFam ):
        # Extract beginning and maybe ending
        (beginning, ending) = RedFamParser.extract_dates( text, isarchive )
        # Missing beginning (Task: FS#76)
        # Use first day of month of reddisc
        if not beginning:
            match = re.search(
                jogobot.config["redundances"]["reddiscs_onlyinclude_re"],
                page.title() )
            if match:
                beginning = datetime.strptime(
                    "01. {month} {year}".format(
                        month=match.group(1), year=match.group(2)),
                    "%d. %B %Y" )
        # Create the RedFam object
-        RedFamParser( heading, pageid, isarchive, beginning, ending )
+        RedFamParser( heading, page, isarchive, beginning, ending )
    @classmethod
    def extract_dates( cls, text, isarchive=False ):
@@ -401,51 +415,13 @@ class RedFamParser( RedFam ):
            else:
                ending = None
        # Missing dates (Task: FS#76)
        else:
            beginning = None
            ending = None
        return (beginning, ending)
    @classmethod
    @deprecated( 'extract_dates' )
    def is_beginning( cls, line ):
        """
        Deprecated wrapper, returns the beginning part of extract_dates

        @param      str    line    String to search in
        @returns                   First element of the tuple returned by
                                   extract_dates( line )
        """
        beginning, _ending = cls.extract_dates( line )
        return beginning
    @classmethod
    @deprecated( 'extract_dates' )
    def is_ending( cls, line, isarchive=False ):
        """
        Deprecated wrapper, returns the ending part of extract_dates

        @param  line    String to search in
        @type  line  str
        @param  isarchive  If true skip searching done notice (on archivepages)
        @type  isarchive  bool

        @returns  Second element of the tuple returned by extract_dates
        """
        # Forward isarchive, it was accepted but silently ignored before
        return cls.extract_dates( line, isarchive )[1]
    @classmethod
    @deprecated( 'extract_dates' )
    def is_ending2( cls, line ):
        """
        Deprecated wrapper, returns the ending part of extract_dates in
        archive mode

        @param      str    line    String to search in
        @returns                   Second element of the tuple returned by
                                   extract_dates( line, True )
        """
        _beginning, ending = cls.extract_dates( line, True )
        return ending
 class RedFamWorker( RedFam ):
    """
--- a/lib/redpage.py
+++ b/lib/redpage.py
@@ -28,9 +28,10 @@ Provides a class for handling redundance discussion pages and archives
 import pywikibot  # noqa
 import mwparserfromhell as mwparser
-import jogobot
+import jogobot  # noqa
-from mysqlred import MysqlRedPage
+from lib.mysqlred import MysqlRedPage
 from lib.redfam import RedFamParser
 class RedPage:
@@ -116,7 +117,7 @@ class RedPage:
        # include_lead = if true include first section (intro)
        # include_heading = if true include heading
        fams = self.wikicode.get_sections(
-            matches=jogobot.config["redundances"]["section_heading_regex"],
+            matches=RedFamParser.is_section_redfam_cb,
            include_lead=False, include_headings=True )
        # Iterate over RedFam
--- a/parse-pages.py
+++ b/parse-pages.py
@@ -1,107 +0,0 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8  -*-
 #
 #  parse-pages.py
 #
 #  Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  You should have received a copy of the GNU General Public License
 #  along with this program; if not, write to the Free Software
 #  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 #  MA 02110-1301, USA.
 #
 #
 """
 Script to parse all redpages in configured categories
 """
 import pywikibot
 from pywikibot import pagegenerators
 import jogobot
 import redpage
 import redfam
 def get_cat_pages( cat ):
    """
    Builds a page generator for the given category

    The category is looked up on the site pywikibot.Site() returns.

    @param  cat  Category to request
    @type  cat  str

    @returns  generator  Result of pagegenerators.CategorizedPageGenerator
                         for the given category
    """
    return pagegenerators.CategorizedPageGenerator(
        pywikibot.Category( pywikibot.Site(), cat ) )
 def main(*args):
    """
    Parses the redundance pages of all configured categories

    The redfam db write cache is flushed after each parsed page, the redpage
    db write cache after each category. The end message is written and
    pywikibot.stopme() is called even if an exception occurs.

    @param  *args  Not used
    """
    try:
        jogobot.output( "BEGINN – parser-pages.py" )

        for cat_name in jogobot.config["redundances"]["redpage_cats"]:

            for page in get_cat_pages( cat_name ):

                # Pages configured to exclude are left out
                if page.title() in (
                        jogobot.config["redundances"]["redpage_exclude"] ):
                    continue

                red_page = redpage.RedPage( page )

                # Nothing to do for pages which need no parsing
                if not red_page.is_parsing_needed():
                    continue

                # Hand every redfam section over to RedFamParser
                for section in red_page.parse():
                    redfam.RedFamParser.parser(
                        section, red_page.page._pageid,
                        red_page.is_archive() )

                # Whole page parsed without error, write cached data
                redfam.RedFamParser.flush_db_cache()
                jogobot.output( "Page '%s' parsed" %
                                red_page.page.title() )

            # All pages of the category done, write cached data
            redpage.RedPage.flush_db_cache()

    finally:
        jogobot.output( "END – parser-pages.py" )
        pywikibot.stopme()
 # Start parsing only if run as script
 if __name__ == "__main__":
    main()
--- a/red.py
+++ b/red.py
@@ -0,0 +1,118 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8  -*-
 #
 #  red.py
 #
 #  Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  You should have received a copy of the GNU General Public License
 #  along with this program; if not, write to the Free Software
 #  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 #  MA 02110-1301, USA.
 #
 #
 """
 Wrapper script to invoke all redundances bot tasks
 """
 import os
 import pywikibot
 import jogobot
 def prepare_bot( task_slug, subtask, genFactory, subtask_args ):
    """
    Handles importing subtask Bot class and prepares specific args

    Raises an Exception if the given subtask does not exist

    @param  task_slug  Task slug, needed for logging (currently unused)
    @type task_slug  str
    @param  subtask  Slug of given subtask, None selects the default
    @type  subtask  str
    @param  genFactory  GenFactory with parsed pagegenerator args
    @type  genFactory  pagegenerators.GeneratorFactory
    @param  subtask_args  Additional args for subtasks (currently unused)
    @type  subtask_args  dict

    @returns  The following tuple
        @return 1  Subtask slug (replaced None for default)
        @rtype  str
        @return 2  Botclass of given subtask (Arg "-task")
        @rtype  Class
        @return 3  GenFactory with parsed pagegenerator args
        @rtype  pagegenerators.GeneratorFactory
        @return 4  Additional args for subtasks
        @rtype  dict
    @rtype  tuple
    """
    # kwargs are passed to selected bot as **kwargs
    kwargs = dict()

    if not subtask or subtask == "discparser":
        # Default case: discparser
        subtask = "discparser"

        # Import related bot (done here so only the selected bot is loaded)
        from bots.reddiscparser import DiscussionParserBot as Bot

    # Subtask error
    else:
        message = "Given subtask \"{subtask}\" is not existing!".format(
            subtask=subtask )

        jogobot.output( "\03{red} " + message, "ERROR" )

        # Same exception type as before, but now with a message
        raise Exception( message )

    return ( subtask, Bot, genFactory, kwargs )
 def main(*args):
    """
    Entry point: evaluates command line arguments and starts the bot

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: list of unicode
    """
    # Global arguments are consumed by pywikibot (e.g. to select the site),
    # the remaining ones are handled below
    remaining_args = pywikibot.handle_args(args)

    # The jogobot task slug is the name of this file without its ending
    script_name = os.path.basename(__file__)
    task_slug = script_name[:-len(".py")]

    # Disabled until [FS#86] is done
    # Before run, we need to check whether we are currently active or not
    # if not jogobot.bot.active( task_slug ):
    #     return

    # Find out which subtask is requested
    parsed = jogobot.bot.parse_local_args( remaining_args )
    ( subtask, genFactory, subtask_args ) = parsed

    # Get the bot class of the subtask and its args
    prepared = prepare_bot( task_slug, subtask, genFactory, subtask_args )
    ( subtask, Bot, genFactory, kwargs ) = prepared

    # Create the bot and let it run
    bot = jogobot.bot.init_bot( task_slug, subtask, Bot, genFactory, **kwargs)
    jogobot.bot.run_bot( task_slug, subtask, bot )
 # Invoke the bot only if run as script
 if __name__ == "__main__":
    main()
		`@@ -0,0 +1,2 @@`
							`#!/usr/bin/env python3`
							`# -- coding: utf-8 --`