Rewrite RedPage.parse using mwparserfromhell to make it simpler

2016-03-03 17:37:46 +01:00
parent b81694c6d3
commit 7422307985
2 changed files with 51 additions and 87 deletions
--- a/redfam.py
+++ b/redfam.py
@@ -126,8 +126,8 @@ class RedFamParser( RedFam ):
    __timestamp_format = jogobot.config['redundances']['timestamp_format']
    # Define section heading re.pattern
-    __sectionhead_pat = re.compile( r"^(=+)(.*\[\[.+\]\].*\[\[.+\]\].*)\1" )
+    __sectionhead_pat = re.compile( r"^(.*\[\[.+\]\].*\[\[.+\]\].*)" )
-    
+
    # Define timestamp re.pattern
    __timestamp_pat = re.compile( jogobot.config['redundances']
                                  ['timestamp_regex'] )
@@ -333,8 +333,8 @@ class RedFamParser( RedFam ):
        @returns    bool            Returns True if it is a section heading
        """
-        
+
-        if cls.__sectionhead_pat.search( line ):
+        if cls.__sectionhead_pat.search( str(line) ):
            return True
        else:
            return False
--- a/redpage.py
+++ b/redpage.py
@@ -26,6 +26,7 @@ Provides a class for handling redundance discussion pages and archives
 """
 import pywikibot  # noqa
+import mwparserfromhell as mwparser
 from mysqlred import MysqlRedPage
 from redfam import RedFamParser
@@ -35,44 +36,44 @@ class RedPage:
    """
    Class for handling redundance discussion pages and archives
    """
-    
+
    def __init__( self, page, archive=False ):
        """
        Generate a new RedPage object based on the given pywikibot page object
-    
+
        @param    page    page    Pywikibot/MediaWiki page object for page
        """
-        
+
        # Safe the pywikibot page object
        self.page = page
        self._archive = archive
-        
+
        self.__handle_db( )
        self.is_page_changed()
-        
+
        self._parsed = None
        if( self._changed or self.__mysql.data[ 'status' ] == 0 ):
            self.parse()
-        
+
        self.__update_db()
-    
+
    def __handle_db( self ):
        """
        Handles opening of db connection
        """
-        
+
        # We need a connection to our mysqldb
        self.__mysql = MysqlRedPage( self.page._pageid )
-        
+
        if not self.__mysql.data:
            self.__mysql.add_page( self.page.title(), self.page._revid )
-            
+
    def is_page_changed( self ):
        """
        Check wether the page was changed since last run
        """
-        
+
        if( self.__mysql.data != { 'page_id': self.page._pageid,
                                   'rev_id': self.page._revid,
                                   'page_title': self.page.title(),
@@ -85,102 +86,65 @@ class RedPage:
        """
        Detects wether current page is an archive of discussions
        """
-        
+
        if( self._archive or ( u"/Archiv" in self.page.title() ) or
            ( "{{Archiv}}" in self.page.text ) or
            ( "{{Archiv|" in self.page.text ) ):
-                
+
                return True
        else:
                return False
-        
+
-    def parse( self ):  # noqa
+    def parse( self ):
        """
        Handles the parsing process
        """
-        # Since @param text is a string we need to split it in lines
+        # Generate Wikicode object
-        text_lines = self.page.text.split( "\n" )
+        self.wikicode = mwparser.parse( self.page.text )
-        length = len( text_lines )
+
-        
+        # Select RedFam-sections
-        # Initialise line counter
+        # matches=Regexp or
-        i = 0
+        #         function( gets heading content as wikicode as param 1)
-        fam_heading = None
+        # include_lead = if true include first section (intro)
-        beginning = None
+        # include_headings = if true include headings
-        ending = None
+        fams = self.wikicode.get_sections(
-    
+            matches=RedFamParser.is_sectionheading,
-        # Set line for last detected Redundance-Family to 0
+            include_lead=False, include_headings=True )
-        last_fam = 0
+
-    
+        # Iterate over RedFam
-        # Iterate over the lines of the page
+        for fam in fams:
-        for line in text_lines:
+
-            
+            # Extract heading text
-            # Check wether we have an "Redundance-Family"-Section heading
+            heading = next( fam.ifilter_headings() ).title
-            if RedFamParser.is_sectionheading( line ):
+
-                
+            # Extract beginning and maybe ending
-                # Save line number for last detected Redundance-Family
+            (beginning, ending) = RedFamParser.extract_dates( fam,
-                last_fam = i
+                                                              self.is_archive()
-                # Save heading
+                                                              )
-                fam_heading = line
+
-                
+            # Create the RedFam object
-                # Defined (re)initialisation of dates
+            RedFamParser( heading, self.page._pageid,
-                beginning = None
+                          self.is_archive(), beginning, ending )
-                ending = None
+
            # Check wether we are currently in an "Redundance-Family"-Section
            if i > last_fam and last_fam > 0:
                # Check if we have alredy recognized the beginning date of the
                # discussion (in former iteration) or if we have a done-notice
                if not beginning:
                    beginning = RedFamParser.is_beginning( line )
                elif not ending:
                    ending = RedFamParser.is_ending( line )
            # Detect end of red_fam section (next line is new sectionheading)
            # or end of file
            # Prevent from running out of index
            if i < (length - 1):
                test = RedFamParser.is_sectionheading( text_lines[ i + 1 ] )
            else:
                test = False
            if ( test or ( length == ( i + 1 ) ) ):
                # Create the red_fam object
                if( fam_heading and beginning ):
                    # Maybe we can find a ending by feed if we have None yet
                    # (No done notice on archive pages)
                    if not ending and self.is_archive():
                        j = i
                        while (j > last_fam) and not ending:
                            j -= 1
                            ending = RedFamParser.is_ending2( text_lines[ j ] )
                    # Create the RedFam object
                    RedFamParser( fam_heading, self.page._pageid,
                                  self.is_archive(), beginning, ending )
            # Increment line counter
            i += 1
        else:
            RedFamParser.flush_db_cache()
            self._parsed = True
-    
+
    def __update_db( self ):
        """
        Updates the page meta data in mysql db
        """
        if( self._parsed or not self._changed ):
            status = 1
-        
+
            if( self.is_archive() ):
                status = 2
        else:
            status = 0
-            
+
        self.__mysql.update_page( self.page._revid, self.page.title(), status )
-        
+
    @classmethod
    def flush_db_cache( cls ):
        """