Merge branch 'restucture-parsers' into test-v3
This commit is contained in:
107
parse-pages.py
Normal file
107
parse-pages.py
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# parse-pages.py
|
||||||
|
#
|
||||||
|
# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||||||
|
# MA 02110-1301, USA.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
"""
|
||||||
|
Script to parse all redpages in configured categories
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pywikibot
|
||||||
|
from pywikibot import pagegenerators
|
||||||
|
|
||||||
|
import jogobot
|
||||||
|
|
||||||
|
import redpage
|
||||||
|
import redfam
|
||||||
|
|
||||||
|
|
||||||
|
def get_cat_pages( cat ):
    """
    Build an iterable generator object yielding all pages listed in the
    given category.

    @param cat Category to request
    @type cat str

    @returns generator Iterable object with the pages of the given category
    """

    # The site to work on comes from the pywikibot configuration
    site = pywikibot.Site()

    # Wrap the requested category of that site
    category = pywikibot.Category( site, cat )

    # Let pywikibot build the iterable page generator for the category
    return pagegenerators.CategorizedPageGenerator( category )
|
||||||
|
|
||||||
|
|
||||||
|
def main(*args):
    """
    Handles the parsing process.

    Iterates over the categories configured in jogobot's config, parses
    every page in them (except explicitly excluded ones) via
    redpage/redfam and flushes the db write caches after each page
    respectively category.

    @param args Unused; accepted for compatibility with pywikibot scripts
    """

    try:
        # Fixed: message previously named "parser-pages.py", but this
        # script is parse-pages.py
        jogobot.output( "BEGINN – parse-pages.py" )

        # Iterate over configured categories
        for cat in ( jogobot.config["redundances"]["redpage_cats"] ):

            # Iterate over pages in current cat
            for page in get_cat_pages( cat ):

                # For pages configured to exclude, go on with next page
                if page.title() in (
                        jogobot.config["redundances"]["redpage_exclude"] ):
                    continue

                # Initiate RedPage object
                red_page = redpage.RedPage( page )

                # Check whether parsing is needed
                if red_page.is_parsing_needed():

                    # Iterate over returned generator with redfam sections
                    for fam in red_page.parse():

                        # Run RedFamParser on section text
                        redfam.RedFamParser.parser(
                            fam, red_page.page._pageid,
                            red_page.is_archive() )
                    else:
                        # for-else without break: always runs once the
                        # whole page was parsed — flush db write cache
                        redfam.RedFamParser.flush_db_cache()
                        jogobot.output( "Page '%s' parsed" %
                                        red_page.page.title() )
            else:
                # If successfully parsed all pages in cat, flush db write cache
                redpage.RedPage.flush_db_cache()

    finally:
        jogobot.output( "END – parse-pages.py" )
        pywikibot.stopme()
|
||||||
|
|
||||||
|
# Run the parser only when executed as a script, not on import
if __name__ == "__main__":
    main()
|
||||||
157
redfam.py
157
redfam.py
@@ -30,7 +30,9 @@ import locale
|
|||||||
import re
|
import re
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
import pywikibot
|
import mwparserfromhell as mwparser # noqa
|
||||||
|
import pywikibot # noqa
|
||||||
|
from pywikibot.tools import deprecated # noqa
|
||||||
|
|
||||||
import jogobot
|
import jogobot
|
||||||
from mysqlred import MysqlRedFam
|
from mysqlred import MysqlRedFam
|
||||||
@@ -124,7 +126,7 @@ class RedFamParser( RedFam ):
|
|||||||
__timestamp_format = jogobot.config['redundances']['timestamp_format']
|
__timestamp_format = jogobot.config['redundances']['timestamp_format']
|
||||||
|
|
||||||
# Define section heading re.pattern
|
# Define section heading re.pattern
|
||||||
__sectionhead_pat = re.compile( r"^(=+)(.*\[\[.+\]\].*\[\[.+\]\].*)\1" )
|
__sectionhead_pat = re.compile( r"^(.*\[\[.+\]\].*\[\[.+\]\].*)" )
|
||||||
|
|
||||||
# Define timestamp re.pattern
|
# Define timestamp re.pattern
|
||||||
__timestamp_pat = re.compile( jogobot.config['redundances']
|
__timestamp_pat = re.compile( jogobot.config['redundances']
|
||||||
@@ -203,36 +205,36 @@ class RedFamParser( RedFam ):
|
|||||||
def heading_parser( self, heading ):
|
def heading_parser( self, heading ):
|
||||||
"""
|
"""
|
||||||
Parses given red_fam_heading string and saves articles list
|
Parses given red_fam_heading string and saves articles list
|
||||||
|
|
||||||
|
@param heading Heading of RedFam-Section
|
||||||
|
@type heading wikicode or mwparser-parseable
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Predefine a pattern for wikilinks' destination
|
# Parse heading with mwparse if needed
|
||||||
wikilink_pat = re.compile( r"\[\[([^\[\]\|]+)(?:\]\]|\|)" )
|
if not isinstance( heading, mwparser.wikicode.Wikicode ):
|
||||||
|
heading = mwparser.parse( heading )
|
||||||
|
|
||||||
# Parse content of heading for generating section links later
|
# Save heading as string
|
||||||
match = type( self ).__sectionhead_pat.search( heading )
|
self._heading = str( heading )
|
||||||
if match:
|
|
||||||
self._heading = match.group(2).strip()
|
|
||||||
else:
|
|
||||||
raise RedFamHeadingError( heading )
|
|
||||||
|
|
||||||
# We get the pages in first [0] element iterating over
|
# Save destinations of wikilinks in headings
|
||||||
# wikilink_pat.findall( line )
|
self._articlesList = [ str( link.title ) for link
|
||||||
# Strip leading and trailing whitespace in Links to prevent wrong
|
in heading.ifilter_wikilinks() ]
|
||||||
# fam_hashes (when receiving redfam from db) since MySQL drops it
|
|
||||||
self._articlesList = [ link.strip() for link
|
|
||||||
in wikilink_pat.findall( self._heading ) ]
|
|
||||||
|
|
||||||
# Catch sections with more then 8 articles, print error
|
# Catch sections with more then 8 articles, print error
|
||||||
if len( self._articlesList ) > 8:
|
if len( self._articlesList ) > 8:
|
||||||
# For repression in output we need to know the fam hash
|
# For repression in output we need to know the fam hash
|
||||||
self.calc_fam_hash()
|
self.calc_fam_hash()
|
||||||
pywikibot.output( "\
|
jogobot.output(
|
||||||
{datetime} – \03{{lightred}}[WARNING] – \
|
( "\03{{lightred}}" +
|
||||||
Maximum number of articles in red_fam exceeded, maximum number is 8, \
|
"Maximum number of articles in red_fam exceeded, " +
|
||||||
{number:d} were given \n {repress}".format(
|
"maximum number is 8, {number:d} were given \n {repress}"
|
||||||
datetime=datetime.now().strftime( "%Y-%m-%d %H:%M:%S" ),
|
).format( datetime=datetime.now().strftime(
|
||||||
number=len( self._articlesList ), repress=repr( self ) ) )
|
"%Y-%m-%d %H:%M:%S" ), number=len( self._articlesList ),
|
||||||
|
repress=repr( self ) ),
|
||||||
|
"WARNING" )
|
||||||
|
|
||||||
|
# Only save the first 8 articles
|
||||||
self._articlesList = self._articlesList[:8]
|
self._articlesList = self._articlesList[:8]
|
||||||
|
|
||||||
def add_beginning( self, beginning ):
|
def add_beginning( self, beginning ):
|
||||||
@@ -323,6 +325,7 @@ Maximum number of articles in red_fam exceeded, maximum number is 8, \
|
|||||||
self._status )
|
self._status )
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@deprecated
|
||||||
def is_sectionheading( cls, line ):
|
def is_sectionheading( cls, line ):
|
||||||
"""
|
"""
|
||||||
Checks wether given line is a red_fam section heading
|
Checks wether given line is a red_fam section heading
|
||||||
@@ -332,12 +335,77 @@ Maximum number of articles in red_fam exceeded, maximum number is 8, \
|
|||||||
@returns bool Returns True if it is a section heading
|
@returns bool Returns True if it is a section heading
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if cls.__sectionhead_pat.search( line ):
|
if cls.__sectionhead_pat.search( str(line) ):
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
def parser( cls, text, pageid, isarchive=False ):
|
||||||
|
"""
|
||||||
|
Handles parsing of redfam section
|
||||||
|
|
||||||
|
@param text Text of RedFam-Section
|
||||||
|
@type text wikicode or mwparser-parseable
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Parse heading with mwparse if needed
|
||||||
|
if not isinstance( text, mwparser.wikicode.Wikicode ):
|
||||||
|
text = mwparser.parse( text )
|
||||||
|
|
||||||
|
# Extract heading text
|
||||||
|
heading = next( text.ifilter_headings() ).title
|
||||||
|
|
||||||
|
# Extract beginnig and maybe ending
|
||||||
|
(beginning, ending) = RedFamParser.extract_dates( text, isarchive )
|
||||||
|
|
||||||
|
# Create the RedFam object
|
||||||
|
RedFamParser( heading, pageid, isarchive, beginning, ending )
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def extract_dates( cls, text, isarchive=False ):
|
||||||
|
"""
|
||||||
|
Returns tuple of the first and maybe last timestamp of a section.
|
||||||
|
Last timestamp is only returned if there is a done notice or param
|
||||||
|
*isarchiv* is set to 'True'
|
||||||
|
|
||||||
|
@param text Text to search in
|
||||||
|
@type line Any Type castable to str
|
||||||
|
@param isarchive If true skip searching done notice (on archivepages)
|
||||||
|
@type isarchive bool
|
||||||
|
|
||||||
|
@returns Timestamps, otherwise None
|
||||||
|
@returntype tuple of strs
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Match all timestamps
|
||||||
|
matches = cls.__timestamp_pat.findall( str( text ) )
|
||||||
|
if matches:
|
||||||
|
|
||||||
|
# First one is beginning
|
||||||
|
# Since some timestamps are broken we need to reconstruct them
|
||||||
|
# by regex match groups
|
||||||
|
beginning = ( matches[0][0] + ", " + matches[0][1] + ". " +
|
||||||
|
matches[0][2] + ". " + matches[0][3] )
|
||||||
|
|
||||||
|
# Last one maybe is ending
|
||||||
|
# Done notice format 1
|
||||||
|
# Done notice format 2
|
||||||
|
# Or on archivepages
|
||||||
|
if ( cls.__done_notice in text or
|
||||||
|
cls.__done_notice2 in text or
|
||||||
|
isarchive ):
|
||||||
|
|
||||||
|
ending = ( matches[-1][0] + ", " + matches[-1][1] + ". " +
|
||||||
|
matches[-1][2] + ". " + matches[-1][3] )
|
||||||
|
|
||||||
|
else:
|
||||||
|
ending = None
|
||||||
|
|
||||||
|
return (beginning, ending)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@deprecated( extract_dates )
|
||||||
def is_beginning( cls, line ):
|
def is_beginning( cls, line ):
|
||||||
"""
|
"""
|
||||||
Returns the first timestamp found in line, otherwise None
|
Returns the first timestamp found in line, otherwise None
|
||||||
@@ -347,36 +415,27 @@ Maximum number of articles in red_fam exceeded, maximum number is 8, \
|
|||||||
@returns str Timestamp, otherwise None
|
@returns str Timestamp, otherwise None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
match = cls.__timestamp_pat.search( line )
|
return cls.extract_dates( line )[0]
|
||||||
if match:
|
|
||||||
# Since some timestamps are broken we need to reconstruct them
|
|
||||||
# by regex match groups
|
|
||||||
result = match.group(1) + ", " + match.group(2) + ". " +\
|
|
||||||
match.group(3) + ". " + match.group(4)
|
|
||||||
return result
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def is_ending( cls, line ):
|
@deprecated( extract_dates )
|
||||||
|
def is_ending( cls, line, isarchive=False ):
|
||||||
"""
|
"""
|
||||||
Returns the timestamp of done notice ( if one ), otherwise None
|
Returns the timestamp of done notice ( if one ), otherwise None
|
||||||
@param str line String to search in
|
|
||||||
|
|
||||||
@returns str Timestamp, otherwise None
|
@param line String to search in
|
||||||
|
@type line str
|
||||||
|
@param isarchive If true skip searching done notice (on archivepages)
|
||||||
|
@type isarchive bool
|
||||||
|
|
||||||
|
@returns Timestamp, otherwise None
|
||||||
|
@returntype str
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if ( cls.__done_notice in line ) or ( cls.__done_notice2 in line ):
|
return cls.extract_dates( line )[1]
|
||||||
match = cls.__timestamp_pat.search( line )
|
|
||||||
if match:
|
|
||||||
# Since some timestamps are broken we need to reconstruct them
|
|
||||||
# by regex match groups
|
|
||||||
result = match.group(1) + ", " + match.group(2) + ". " +\
|
|
||||||
match.group(3) + ". " + match.group(4)
|
|
||||||
return result
|
|
||||||
return None
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@deprecated( extract_dates )
|
||||||
def is_ending2( cls, line ):
|
def is_ending2( cls, line ):
|
||||||
"""
|
"""
|
||||||
Returns the last timestamp found in line, otherwise None
|
Returns the last timestamp found in line, otherwise None
|
||||||
@@ -385,15 +444,7 @@ Maximum number of articles in red_fam exceeded, maximum number is 8, \
|
|||||||
@returns str Timestamp, otherwise None
|
@returns str Timestamp, otherwise None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
matches = cls.__timestamp_pat.findall( line )
|
return cls.extract_dates( line, True )[1]
|
||||||
if matches:
|
|
||||||
# Since some timestamps are broken we need to reconstruct them
|
|
||||||
# by regex match groups
|
|
||||||
result = matches[-1][0] + ", " + matches[-1][1] + ". " +\
|
|
||||||
matches[-1][2] + ". " + matches[-1][3]
|
|
||||||
return result
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
class RedFamWorker( RedFam ):
|
class RedFamWorker( RedFam ):
|
||||||
|
|||||||
95
redpage.py
95
redpage.py
@@ -26,9 +26,11 @@ Provides a class for handling redundance discussion pages and archives
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import pywikibot # noqa
|
import pywikibot # noqa
|
||||||
|
import mwparserfromhell as mwparser
|
||||||
|
|
||||||
|
import jogobot
|
||||||
|
|
||||||
from mysqlred import MysqlRedPage
|
from mysqlred import MysqlRedPage
|
||||||
from redfam import RedFamParser
|
|
||||||
|
|
||||||
|
|
||||||
class RedPage:
|
class RedPage:
|
||||||
@@ -52,10 +54,6 @@ class RedPage:
|
|||||||
self.is_page_changed()
|
self.is_page_changed()
|
||||||
|
|
||||||
self._parsed = None
|
self._parsed = None
|
||||||
if( self._changed or self.__mysql.data[ 'status' ] == 0 ):
|
|
||||||
self.parse()
|
|
||||||
|
|
||||||
self.__update_db()
|
|
||||||
|
|
||||||
def __handle_db( self ):
|
def __handle_db( self ):
|
||||||
"""
|
"""
|
||||||
@@ -94,78 +92,41 @@ class RedPage:
|
|||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def parse( self ): # noqa
|
def is_parsing_needed( self ):
|
||||||
|
"""
|
||||||
|
Decides wether current RedPage needs to be parsed or not
|
||||||
|
"""
|
||||||
|
|
||||||
|
if( self._changed or self.__mysql.data[ 'status' ] == 0 ):
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def parse( self ):
|
||||||
"""
|
"""
|
||||||
Handles the parsing process
|
Handles the parsing process
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Since @param text is a string we need to split it in lines
|
# Generate Wikicode object
|
||||||
text_lines = self.page.text.split( "\n" )
|
self.wikicode = mwparser.parse( self.page.text )
|
||||||
length = len( text_lines )
|
|
||||||
|
|
||||||
# Initialise line counter
|
# Select RedFam-sections
|
||||||
i = 0
|
# matches=Regexp or
|
||||||
fam_heading = None
|
# function( gets heading content as wikicode as param 1)
|
||||||
beginning = None
|
# include_lead = if true include first section (intro)
|
||||||
ending = None
|
# include_heading = if true include heading
|
||||||
|
fams = self.wikicode.get_sections(
|
||||||
|
matches=jogobot.config["redundances"]["section_heading_regex"],
|
||||||
|
include_lead=False, include_headings=True )
|
||||||
|
|
||||||
# Set line for last detected Redundance-Family to 0
|
# Iterate over RedFam
|
||||||
last_fam = 0
|
for fam in fams:
|
||||||
|
|
||||||
# Iterate over the lines of the page
|
yield fam
|
||||||
for line in text_lines:
|
|
||||||
|
|
||||||
# Check wether we have an "Redundance-Family"-Section heading
|
|
||||||
if RedFamParser.is_sectionheading( line ):
|
|
||||||
|
|
||||||
# Save line number for last detected Redundance-Family
|
|
||||||
last_fam = i
|
|
||||||
# Save heading
|
|
||||||
fam_heading = line
|
|
||||||
|
|
||||||
# Defined (re)initialisation of dates
|
|
||||||
beginning = None
|
|
||||||
ending = None
|
|
||||||
|
|
||||||
# Check wether we are currently in an "Redundance-Family"-Section
|
|
||||||
if i > last_fam and last_fam > 0:
|
|
||||||
|
|
||||||
# Check if we have alredy recognized the beginning date of the
|
|
||||||
# discussion (in former iteration) or if we have a done-notice
|
|
||||||
if not beginning:
|
|
||||||
beginning = RedFamParser.is_beginning( line )
|
|
||||||
elif not ending:
|
|
||||||
ending = RedFamParser.is_ending( line )
|
|
||||||
|
|
||||||
# Detect end of red_fam section (next line is new sectionheading)
|
|
||||||
# or end of file
|
|
||||||
# Prevent from running out of index
|
|
||||||
if i < (length - 1):
|
|
||||||
test = RedFamParser.is_sectionheading( text_lines[ i + 1 ] )
|
|
||||||
else:
|
else:
|
||||||
test = False
|
|
||||||
if ( test or ( length == ( i + 1 ) ) ):
|
|
||||||
|
|
||||||
# Create the red_fam object
|
|
||||||
if( fam_heading and beginning ):
|
|
||||||
|
|
||||||
# Maybe we can find a ending by feed if we have None yet
|
|
||||||
# (No done notice on archive pages)
|
|
||||||
if not ending and self.is_archive():
|
|
||||||
j = i
|
|
||||||
while (j > last_fam) and not ending:
|
|
||||||
j -= 1
|
|
||||||
ending = RedFamParser.is_ending2( text_lines[ j ] )
|
|
||||||
|
|
||||||
# Create the RedFam object
|
|
||||||
RedFamParser( fam_heading, self.page._pageid,
|
|
||||||
self.is_archive(), beginning, ending )
|
|
||||||
|
|
||||||
# Increment line counter
|
|
||||||
i += 1
|
|
||||||
else:
|
|
||||||
RedFamParser.flush_db_cache()
|
|
||||||
self._parsed = True
|
self._parsed = True
|
||||||
|
self.__update_db()
|
||||||
|
|
||||||
def __update_db( self ):
|
def __update_db( self ):
|
||||||
"""
|
"""
|
||||||
|
|||||||
Reference in New Issue
Block a user