Rewrite RedPage.parse using mwparserfromhell to make it simpler

2016-03-03 17:37:46 +01:00
parent b81694c6d3
commit 7422307985
2 changed files with 51 additions and 87 deletions
--- a/redpage.py
+++ b/redpage.py
@@ -26,6 +26,7 @@ Provides a class for handling redundance discussion pages and archives
 """

 import pywikibot  # noqa
+import mwparserfromhell as mwparser

 from mysqlred import MysqlRedPage
 from redfam import RedFamParser
@@ -35,44 +36,44 @@ class RedPage:
    """
    Class for handling redundance discussion pages and archives
    """
-    
+
    def __init__( self, page, archive=False ):
        """
        Generate a new RedPage object based on the given pywikibot page object
-    
+
        @param    page    page    Pywikibot/MediaWiki page object for page
        """
-        
+
        # Safe the pywikibot page object
        self.page = page
        self._archive = archive
-        
+
        self.__handle_db( )

        self.is_page_changed()
-        
+
        self._parsed = None
        if( self._changed or self.__mysql.data[ 'status' ] == 0 ):
            self.parse()
-        
+
        self.__update_db()
-    
+
    def __handle_db( self ):
        """
        Handles opening of db connection
        """
-        
+
        # We need a connection to our mysqldb
        self.__mysql = MysqlRedPage( self.page._pageid )
-        
+
        if not self.__mysql.data:
            self.__mysql.add_page( self.page.title(), self.page._revid )
-            
+
    def is_page_changed( self ):
        """
        Check wether the page was changed since last run
        """
-        
+
        if( self.__mysql.data != { 'page_id': self.page._pageid,
                                   'rev_id': self.page._revid,
                                   'page_title': self.page.title(),
@@ -85,102 +86,65 @@ class RedPage:
        """
        Detects wether current page is an archive of discussions
        """
-        
+
        if( self._archive or ( u"/Archiv" in self.page.title() ) or
            ( "{{Archiv}}" in self.page.text ) or
            ( "{{Archiv|" in self.page.text ) ):
-                
+
                return True
        else:
                return False
-        
-    def parse( self ):  # noqa
+
+    def parse( self ):
        """
        Handles the parsing process
        """

-        # Since @param text is a string we need to split it in lines
-        text_lines = self.page.text.split( "\n" )
-        length = len( text_lines )
-        
-        # Initialise line counter
-        i = 0
-        fam_heading = None
-        beginning = None
-        ending = None
-    
-        # Set line for last detected Redundance-Family to 0
-        last_fam = 0
-    
-        # Iterate over the lines of the page
-        for line in text_lines:
-            
-            # Check wether we have an "Redundance-Family"-Section heading
-            if RedFamParser.is_sectionheading( line ):
-                
-                # Save line number for last detected Redundance-Family
-                last_fam = i
-                # Save heading
-                fam_heading = line
-                
-                # Defined (re)initialisation of dates
-                beginning = None
-                ending = None
-            
-            # Check wether we are currently in an "Redundance-Family"-Section
-            if i > last_fam and last_fam > 0:
-            
-                # Check if we have alredy recognized the beginning date of the
-                # discussion (in former iteration) or if we have a done-notice
-                if not beginning:
-                    beginning = RedFamParser.is_beginning( line )
-                elif not ending:
-                    ending = RedFamParser.is_ending( line )
-            
-            # Detect end of red_fam section (next line is new sectionheading)
-            # or end of file
-            # Prevent from running out of index
-            if i < (length - 1):
-                test = RedFamParser.is_sectionheading( text_lines[ i + 1 ] )
-            else:
-                test = False
-            if ( test or ( length == ( i + 1 ) ) ):
-                
-                # Create the red_fam object
-                if( fam_heading and beginning ):
-                    
-                    # Maybe we can find a ending by feed if we have None yet
-                    # (No done notice on archive pages)
-                    if not ending and self.is_archive():
-                        j = i
-                        while (j > last_fam) and not ending:
-                            j -= 1
-                            ending = RedFamParser.is_ending2( text_lines[ j ] )
-                    
-                    # Create the RedFam object
-                    RedFamParser( fam_heading, self.page._pageid,
-                                  self.is_archive(), beginning, ending )
-            
-            # Increment line counter
-            i += 1
+        # Generate Wikicode object
+        self.wikicode = mwparser.parse( self.page.text )
+
+        # Select RedFam-sections
+        # matches = regexp, or a function that receives the heading
+        #           content (as wikicode) as its first parameter
+        # include_lead = if true, include the first section (intro)
+        # include_headings = if true, include the heading of each section
+        fams = self.wikicode.get_sections(
+            matches=RedFamParser.is_sectionheading,
+            include_lead=False, include_headings=True )
+
+        # Iterate over RedFam
+        for fam in fams:
+
+            # Extract heading text
+            heading = next( fam.ifilter_headings() ).title
+
+            # Extract beginning and maybe ending
+            (beginning, ending) = RedFamParser.extract_dates( fam,
+                                                              self.is_archive()
+                                                              )
+
+            # Create the RedFam object
+            RedFamParser( heading, self.page._pageid,
+                          self.is_archive(), beginning, ending )
+
        else:
            RedFamParser.flush_db_cache()
            self._parsed = True
-    
+
    def __update_db( self ):
        """
        Updates the page meta data in mysql db
        """
        if( self._parsed or not self._changed ):
            status = 1
-        
+
            if( self.is_archive() ):
                status = 2
        else:
            status = 0
-            
+
        self.__mysql.update_page( self.page._revid, self.page.title(), status )
-        
+
    @classmethod
    def flush_db_cache( cls ):
        """