
Merge branch 'restucture-parsers' into test-v3

Branch: develop
Jonathan Golder, 8 years ago
Commit: 944bea488a
Changed files:
  1. parse-pages.py (107 changed lines)
  2. redfam.py (159 changed lines)
  3. redpage.py (93 changed lines)
  4. tox.ini (2 changed lines)

parse-pages.py (107 changed lines)

@@ -0,0 +1,107 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# parse-pages.py
#
# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
"""
Script to parse all redpages in configured categories
"""
import pywikibot
from pywikibot import pagegenerators
import jogobot
import redpage
import redfam
def get_cat_pages( cat ):
    """
    Generates an iterable generator object with all pages listed in the
    given category

    @param cat Category to request
    @type cat str

    @returns generator Iterable object with pages of the given category
    """
    # Get site to work on from pywikibot config
    site = pywikibot.Site()

    # Retrieve the content of given category
    category = pywikibot.Category( site, cat )

    # Build an iterable generator object with page objects for given category
    generator = pagegenerators.CategorizedPageGenerator( category )

    return generator


def main(*args):
    """
    Handles process
    """
    try:
        jogobot.output( "BEGIN – parse-pages.py" )

        # Iterate over configured categories
        for cat in ( jogobot.config["redundances"]["redpage_cats"] ):

            # Iterate over pages in current cat
            for page in get_cat_pages( cat ):

                # For pages configured to exclude, go on with next page
                if page.title() in (
                        jogobot.config["redundances"]["redpage_exclude"] ):
                    continue

                # Initiate RedPage object
                red_page = redpage.RedPage( page )

                # Check whether parsing is needed
                if red_page.is_parsing_needed():

                    # Iterate over returned generator with redfam sections
                    for fam in red_page.parse():

                        # Run RedFamParser on section text
                        redfam.RedFamParser.parser( fam, red_page.page._pageid,
                                                    red_page.is_archive() )
                    else:
                        # If successfully parsed whole page, flush
                        # db write cache
                        redfam.RedFamParser.flush_db_cache()
                        jogobot.output( "Page '%s' parsed" %
                                        red_page.page.title() )
            else:
                # If successfully parsed all pages in cat, flush db write cache
                redpage.RedPage.flush_db_cache()

    finally:
        jogobot.output( "END – parse-pages.py" )
        pywikibot.stopme()


if( __name__ == "__main__" ):
    main()
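
Note: the category traversal in get_cat_pages() above is plain pywikibot. Below is a minimal, self-contained sketch of the same traversal, assuming an already configured pywikibot site; the category name "Kategorie:Wikipedia:Redundanz" is only a placeholder, not necessarily the value read from jogobot.config.

# Sketch: list the titles of all pages in one category, the same traversal
# get_cat_pages() performs before pages are handed to RedPage.
# The category name is a placeholder, not the configured one.
import pywikibot
from pywikibot import pagegenerators

site = pywikibot.Site()
category = pywikibot.Category( site, "Kategorie:Wikipedia:Redundanz" )

for page in pagegenerators.CategorizedPageGenerator( category ):
    print( page.title() )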

redfam.py (159 changed lines)

@@ -30,7 +30,9 @@ import locale
import re
from datetime import datetime
import pywikibot
import mwparserfromhell as mwparser # noqa
import pywikibot # noqa
from pywikibot.tools import deprecated # noqa
import jogobot
from mysqlred import MysqlRedFam
@@ -124,7 +126,7 @@ class RedFamParser( RedFam ):
__timestamp_format = jogobot.config['redundances']['timestamp_format']
# Define section heading re.pattern
__sectionhead_pat = re.compile( r"^(=+)(.*\[\[.+\]\].*\[\[.+\]\].*)\1" )
__sectionhead_pat = re.compile( r"^(.*\[\[.+\]\].*\[\[.+\]\].*)" )
# Define timestamp re.pattern
__timestamp_pat = re.compile( jogobot.config['redundances']
@@ -203,36 +205,36 @@ class RedFamParser( RedFam ):
def heading_parser( self, heading ):
"""
Parses given red_fam_heading string and saves articles list
@param heading Heading of RedFam-Section
@type heading wikicode or mwparser-parseable
"""
# Predefine a pattern for wikilinks' destination
wikilink_pat = re.compile( r"\[\[([^\[\]\|]+)(?:\]\]|\|)" )
# Parse heading with mwparser if needed
if not isinstance( heading, mwparser.wikicode.Wikicode ):
heading = mwparser.parse( heading )
# Parse content of heading for generating section links later
match = type( self ).__sectionhead_pat.search( heading )
if match:
self._heading = match.group(2).strip()
else:
raise RedFamHeadingError( heading )
# Save heading as string
self._heading = str( heading )
# We get the pages in first [0] element iterating over
# wikilink_pat.findall( line )
# Strip leading and trailing whitespace in Links to prevent wrong
# fam_hashes (when receiving redfam from db) since MySQL drops it
self._articlesList = [ link.strip() for link
in wikilink_pat.findall( self._heading ) ]
# Save destinations of wikilinks in headings
self._articlesList = [ str( link.title ) for link
in heading.ifilter_wikilinks() ]
# Catch sections with more than 8 articles, print error
if len( self._articlesList ) > 8:
# For the repr() in the output we need to know the fam hash
self.calc_fam_hash()
pywikibot.output( "\
{datetime} \03{{lightred}}[WARNING] \
Maximum number of articles in red_fam exceeded, maximum number is 8, \
{number:d} were given \n {repress}".format(
datetime=datetime.now().strftime( "%Y-%m-%d %H:%M:%S" ),
number=len( self._articlesList ), repress=repr( self ) ) )
jogobot.output(
( "\03{{lightred}}" +
"Maximum number of articles in red_fam exceeded, " +
"maximum number is 8, {number:d} were given \n {repress}"
).format( datetime=datetime.now().strftime(
"%Y-%m-%d %H:%M:%S" ), number=len( self._articlesList ),
repress=repr( self ) ),
"WARNING" )
# Only save the first 8 articles
self._articlesList = self._articlesList[:8]
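
Note: the reworked heading_parser above pulls article names out of mwparserfromhell wikilink nodes instead of a hand-written wikilink regex. Below is a minimal sketch of that extraction; the heading text is an invented example, not taken from a real discussion page.

# Sketch: extract the article names from a redundancy-section heading via
# mwparserfromhell, mirroring the new str( link.title ) handling above.
import mwparserfromhell as mwparser

heading = "[[Artikel A]] - [[Artikel B]] - [[Artikel C|C]]"
wikicode = mwparser.parse( heading )

articles = [ str( link.title ) for link in wikicode.ifilter_wikilinks() ]

print( articles )  # ['Artikel A', 'Artikel B', 'Artikel C']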
def add_beginning( self, beginning ):
@@ -323,6 +325,7 @@ Maximum number of articles in red_fam exceeded, maximum number is 8, \
self._status )
@classmethod
@deprecated
def is_sectionheading( cls, line ):
"""
Checks whether the given line is a red_fam section heading
@@ -332,51 +335,107 @@ Maximum number of articles in red_fam exceeded, maximum number is 8, \
@returns bool Returns True if it is a section heading
"""
if cls.__sectionhead_pat.search( line ):
if cls.__sectionhead_pat.search( str(line) ):
return True
else:
return False
@classmethod
def is_beginning( cls, line ):
def parser( cls, text, pageid, isarchive=False ):
"""
Returns the first timestamp found in line, otherwise None
Handles parsing of redfam section
@param str line String to search in
@param text Text of RedFam-Section
@type text wikicode or mwparser-parseable
"""
@returns str Timestamp, otherwise None
# Parse text with mwparser if needed
if not isinstance( text, mwparser.wikicode.Wikicode ):
text = mwparser.parse( text )
# Extract heading text
heading = next( text.ifilter_headings() ).title
# Extract beginning and maybe ending
(beginning, ending) = RedFamParser.extract_dates( text, isarchive )
# Create the RedFam object
RedFamParser( heading, pageid, isarchive, beginning, ending )
@classmethod
def extract_dates( cls, text, isarchive=False ):
"""
Returns a tuple of the first and (maybe) the last timestamp of a section.
The last timestamp is only returned if there is a done notice or param
*isarchive* is set to 'True'
@param text Text to search in
@type text any type castable to str
@param isarchive If true skip searching done notice (on archivepages)
@type isarchive bool
@returns Timestamps, otherwise None
@returntype tuple of strs
"""
match = cls.__timestamp_pat.search( line )
if match:
# Match all timestamps
matches = cls.__timestamp_pat.findall( str( text ) )
if matches:
# First one is beginning
# Since some timestamps are broken we need to reconstruct them
# by regex match groups
result = match.group(1) + ", " + match.group(2) + ". " +\
match.group(3) + ". " + match.group(4)
return result
beginning = ( matches[0][0] + ", " + matches[0][1] + ". " +
matches[0][2] + ". " + matches[0][3] )
# Last one maybe is ending
# Done notice format 1
# Done notice format 2
# Or on archivepages
if ( cls.__done_notice in text or
cls.__done_notice2 in text or
isarchive ):
ending = ( matches[-1][0] + ", " + matches[-1][1] + ". " +
matches[-1][2] + ". " + matches[-1][3] )
else:
return None
ending = None
return (beginning, ending)
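
Note: extract_dates() above reassembles timestamps from regex capture groups because some signatures on the live pages are broken. Below is a sketch of that mechanism; the real pattern is read from jogobot.config['redundances'], and the regex here is only a simplified stand-in for German Wikipedia signature timestamps.

# Sketch: the first match is taken as the beginning, the last match as the
# (possible) ending; both are rebuilt from the capture groups.
import re

timestamp_pat = re.compile( r"(\d\d:\d\d), (\d{1,2})\. (\w{3,4})\.? (\d{4})" )

text = ( "Begründung ... 10:23, 4. Jan. 2016 (CET)\n"
         ":Erledigt ... 18:05, 7. Feb. 2016 (CET)" )

matches = timestamp_pat.findall( text )

beginning = ( matches[0][0] + ", " + matches[0][1] + ". " +
              matches[0][2] + ". " + matches[0][3] )
ending = ( matches[-1][0] + ", " + matches[-1][1] + ". " +
           matches[-1][2] + ". " + matches[-1][3] )

print( beginning )  # 10:23, 4. Jan. 2016
print( ending )     # 18:05, 7. Feb. 2016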
@classmethod
def is_ending( cls, line ):
@deprecated( extract_dates )
def is_beginning( cls, line ):
"""
Returns the timestamp of done notice ( if one ), otherwise None
Returns the first timestamp found in line, otherwise None
@param str line String to search in
@returns str Timestamp, otherwise None
"""
if ( cls.__done_notice in line ) or ( cls.__done_notice2 in line ):
match = cls.__timestamp_pat.search( line )
if match:
# Since some timestamps are broken we need to reconstruct them
# by regex match groups
result = match.group(1) + ", " + match.group(2) + ". " +\
match.group(3) + ". " + match.group(4)
return result
return None
return cls.extract_dates( line )[0]
@classmethod
@deprecated( extract_dates )
def is_ending( cls, line, isarchive=False ):
"""
Returns the timestamp of done notice ( if one ), otherwise None
@param line String to search in
@type line str
@param isarchive If true skip searching done notice (on archivepages)
@type isarchive bool
@returns Timestamp, otherwise None
@returntype str
"""
return cls.extract_dates( line )[1]
@classmethod
@deprecated( extract_dates )
def is_ending2( cls, line ):
"""
Returns the last timestamp found in line, otherwise None
@@ -385,15 +444,7 @@ Maximum number of articles in red_fam exceeded, maximum number is 8, \
@returns str Timestamp, otherwise None
"""
matches = cls.__timestamp_pat.findall( line )
if matches:
# Since some timestamps are broken we need to reconstruct them
# by regex match groups
result = matches[-1][0] + ", " + matches[-1][1] + ". " +\
matches[-1][2] + ". " + matches[-1][3]
return result
else:
return None
return cls.extract_dates( line, True )[1]
class RedFamWorker( RedFam ):

redpage.py (93 changed lines)

@@ -26,9 +26,11 @@ Provides a class for handling redundance discussion pages and archives
"""
import pywikibot # noqa
import mwparserfromhell as mwparser
import jogobot
from mysqlred import MysqlRedPage
from redfam import RedFamParser
class RedPage:
@@ -52,10 +54,6 @@ class RedPage:
self.is_page_changed()
self._parsed = None
if( self._changed or self.__mysql.data[ 'status' ] == 0 ):
self.parse()
self.__update_db()
def __handle_db( self ):
"""
@@ -94,78 +92,41 @@ class RedPage:
else:
return False
def parse( self ): # noqa
def is_parsing_needed( self ):
"""
Handles the parsing process
Decides whether the current RedPage needs to be parsed or not
"""
# Since @param text is a string we need to split it in lines
text_lines = self.page.text.split( "\n" )
length = len( text_lines )
# Initialise line counter
i = 0
fam_heading = None
beginning = None
ending = None
# Set line for last detected Redundance-Family to 0
last_fam = 0
# Iterate over the lines of the page
for line in text_lines:
if( self._changed or self.__mysql.data[ 'status' ] == 0 ):
return True
else:
return False
# Check wether we have an "Redundance-Family"-Section heading
if RedFamParser.is_sectionheading( line ):
def parse( self ):
"""
Handles the parsing process
"""
# Save line number for last detected Redundance-Family
last_fam = i
# Save heading
fam_heading = line
# Generate Wikicode object
self.wikicode = mwparser.parse( self.page.text )
# Defined (re)initialisation of dates
beginning = None
ending = None
# Select RedFam-sections
# matches = regexp or a function (receives the heading content as
# wikicode as param 1)
# include_lead = if true, include the first section (intro)
# include_headings = if true, include the headings
fams = self.wikicode.get_sections(
matches=jogobot.config["redundances"]["section_heading_regex"],
include_lead=False, include_headings=True )
# Check wether we are currently in an "Redundance-Family"-Section
if i > last_fam and last_fam > 0:
# Iterate over RedFam
for fam in fams:
# Check if we have alredy recognized the beginning date of the
# discussion (in former iteration) or if we have a done-notice
if not beginning:
beginning = RedFamParser.is_beginning( line )
elif not ending:
ending = RedFamParser.is_ending( line )
yield fam
# Detect end of red_fam section (next line is new sectionheading)
# or end of file
# Prevent from running out of index
if i < (length - 1):
test = RedFamParser.is_sectionheading( text_lines[ i + 1 ] )
else:
test = False
if ( test or ( length == ( i + 1 ) ) ):
# Create the red_fam object
if( fam_heading and beginning ):
# Maybe we can find a ending by feed if we have None yet
# (No done notice on archive pages)
if not ending and self.is_archive():
j = i
while (j > last_fam) and not ending:
j -= 1
ending = RedFamParser.is_ending2( text_lines[ j ] )
# Create the RedFam object
RedFamParser( fam_heading, self.page._pageid,
self.is_archive(), beginning, ending )
# Increment line counter
i += 1
else:
RedFamParser.flush_db_cache()
self._parsed = True
self.__update_db()
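
Note: the new parse() above delegates section splitting to mwparserfromhell's get_sections() instead of walking the page text line by line. Below is a sketch of that call; the page text is invented sample wikitext and the heading regex is only a placeholder for jogobot.config["redundances"]["section_heading_regex"].

# Sketch: select the per-family sections of a discussion page.
# matches= is applied to each heading, include_lead=False drops the intro,
# include_headings=True keeps the heading line inside each section.
import mwparserfromhell as mwparser

page_text = """Intro der Redundanzseite.

== [[Artikel A]] - [[Artikel B]] ==
Begründung ... 10:23, 4. Jan. 2016 (CET)

== [[Artikel C]] - [[Artikel D]] ==
Begründung ... 11:42, 5. Jan. 2016 (CET)
"""

wikicode = mwparser.parse( page_text )

fams = wikicode.get_sections(
    matches=r"\[\[.+\]\].*\[\[.+\]\]",
    include_lead=False, include_headings=True )

for fam in fams:
    print( str( next( fam.ifilter_headings() ).title ).strip() )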
def __update_db( self ):
"""

tox.ini (2 changed lines)

@@ -1,2 +0,0 @@
[flake8]
ignore = E129,E201,E202,W293