From 74223079859caded0f526da1f9943f6d9806af87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?=
 <jonathan@golderweb.de>
Date: Thu, 3 Mar 2016 17:37:46 +0100
Subject: [PATCH] Rewrite RedPage.parse using mwparserfromhell to make it
 simpler

---
 redfam.py  |   8 ++--
 redpage.py | 130 +++++++++++++++++++----------------------------------
 2 files changed, 51 insertions(+), 87 deletions(-)

diff --git a/redfam.py b/redfam.py
index 048777a..1632841 100644
--- a/redfam.py
+++ b/redfam.py
@@ -126,8 +126,8 @@ class RedFamParser( RedFam ):
     __timestamp_format = jogobot.config['redundances']['timestamp_format']
 
     # Define section heading re.pattern
-    __sectionhead_pat = re.compile( r"^(=+)(.*\[\[.+\]\].*\[\[.+\]\].*)\1" )
-    
+    __sectionhead_pat = re.compile( r"^(.*\[\[.+\]\].*\[\[.+\]\].*)" )
+
     # Define timestamp re.pattern
     __timestamp_pat = re.compile( jogobot.config['redundances']
                                   ['timestamp_regex'] )
@@ -333,8 +333,8 @@ class RedFamParser( RedFam ):
 
         @returns    bool            Returns True if it is a section heading
         """
-        
-        if cls.__sectionhead_pat.search( line ):
+
+        if cls.__sectionhead_pat.search( str(line) ):
             return True
         else:
             return False
diff --git a/redpage.py b/redpage.py
index 96a853d..a35cf9e 100644
--- a/redpage.py
+++ b/redpage.py
@@ -26,6 +26,7 @@ Provides a class for handling redundance discussion pages and archives
 """
 
 import pywikibot  # noqa
+import mwparserfromhell as mwparser
 
 from mysqlred import MysqlRedPage
 from redfam import RedFamParser
@@ -35,44 +36,44 @@ class RedPage:
     """
     Class for handling redundance discussion pages and archives
     """
-    
+
     def __init__( self, page, archive=False ):
         """
         Generate a new RedPage object based on the given pywikibot page object
-    
+
         @param    page    page    Pywikibot/MediaWiki page object for page
         """
-        
+
         # Safe the pywikibot page object
         self.page = page
         self._archive = archive
-        
+
         self.__handle_db( )
 
         self.is_page_changed()
-        
+
         self._parsed = None
         if( self._changed or self.__mysql.data[ 'status' ] == 0 ):
             self.parse()
-        
+
         self.__update_db()
-    
+
     def __handle_db( self ):
         """
         Handles opening of db connection
         """
-        
+
         # We need a connection to our mysqldb
         self.__mysql = MysqlRedPage( self.page._pageid )
-        
+
         if not self.__mysql.data:
             self.__mysql.add_page( self.page.title(), self.page._revid )
-            
+
     def is_page_changed( self ):
         """
         Check wether the page was changed since last run
         """
-        
+
         if( self.__mysql.data != { 'page_id': self.page._pageid,
                                    'rev_id': self.page._revid,
                                    'page_title': self.page.title(),
@@ -85,102 +86,65 @@ class RedPage:
         """
         Detects wether current page is an archive of discussions
         """
-        
+
         if( self._archive or ( u"/Archiv" in self.page.title() ) or
             ( "{{Archiv}}" in self.page.text ) or
             ( "{{Archiv|" in self.page.text ) ):
-                
+
                 return True
         else:
                 return False
-        
-    def parse( self ):  # noqa
+
+    def parse( self ):
         """
         Handles the parsing process
         """
 
-        # Since @param text is a string we need to split it in lines
-        text_lines = self.page.text.split( "\n" )
-        length = len( text_lines )
-        
-        # Initialise line counter
-        i = 0
-        fam_heading = None
-        beginning = None
-        ending = None
-    
-        # Set line for last detected Redundance-Family to 0
-        last_fam = 0
-    
-        # Iterate over the lines of the page
-        for line in text_lines:
-            
-            # Check wether we have an "Redundance-Family"-Section heading
-            if RedFamParser.is_sectionheading( line ):
-                
-                # Save line number for last detected Redundance-Family
-                last_fam = i
-                # Save heading
-                fam_heading = line
-                
-                # Defined (re)initialisation of dates
-                beginning = None
-                ending = None
-            
-            # Check wether we are currently in an "Redundance-Family"-Section
-            if i > last_fam and last_fam > 0:
-            
-                # Check if we have alredy recognized the beginning date of the
-                # discussion (in former iteration) or if we have a done-notice
-                if not beginning:
-                    beginning = RedFamParser.is_beginning( line )
-                elif not ending:
-                    ending = RedFamParser.is_ending( line )
-            
-            # Detect end of red_fam section (next line is new sectionheading)
-            # or end of file
-            # Prevent from running out of index
-            if i < (length - 1):
-                test = RedFamParser.is_sectionheading( text_lines[ i + 1 ] )
-            else:
-                test = False
-            if ( test or ( length == ( i + 1 ) ) ):
-                
-                # Create the red_fam object
-                if( fam_heading and beginning ):
-                    
-                    # Maybe we can find a ending by feed if we have None yet
-                    # (No done notice on archive pages)
-                    if not ending and self.is_archive():
-                        j = i
-                        while (j > last_fam) and not ending:
-                            j -= 1
-                            ending = RedFamParser.is_ending2( text_lines[ j ] )
-                    
-                    # Create the RedFam object
-                    RedFamParser( fam_heading, self.page._pageid,
-                                  self.is_archive(), beginning, ending )
-            
-            # Increment line counter
-            i += 1
+        # Generate Wikicode object
+        self.wikicode = mwparser.parse( self.page.text )
+
+        # Select RedFam-sections
+        # matches=Regexp or
+        #         function( gets heading content as wikicode as param 1)
+        # include_lead = if true include first section (intro)
+        # include_headings = if true include heading
+        fams = self.wikicode.get_sections(
+            matches=RedFamParser.is_sectionheading,
+            include_lead=False, include_headings=True )
+
+        # Iterate over RedFam
+        for fam in fams:
+
+            # Extract heading text
+            heading = next( fam.ifilter_headings() ).title
+
+            # Extract beginning and maybe ending
+            (beginning, ending) = RedFamParser.extract_dates( fam,
+                                                              self.is_archive()
+                                                              )
+
+            # Create the RedFam object
+            RedFamParser( heading, self.page._pageid,
+                          self.is_archive(), beginning, ending )
+
         else:
             RedFamParser.flush_db_cache()
             self._parsed = True
-    
+
     def __update_db( self ):
         """
         Updates the page meta data in mysql db
         """
         if( self._parsed or not self._changed ):
             status = 1
-        
+
             if( self.is_archive() ):
                 status = 2
         else:
             status = 0
-            
+
         self.__mysql.update_page( self.page._revid, self.page.title(), status )
-        
+
     @classmethod
     def flush_db_cache( cls ):
         """