From 74223079859caded0f526da1f9943f6d9806af87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Thu, 3 Mar 2016 17:37:46 +0100 Subject: [PATCH] Rewrite RedPage.parse using mwparserfromhell to make it simpler --- redfam.py | 8 ++-- redpage.py | 130 +++++++++++++++++++---------------------------------- 2 files changed, 51 insertions(+), 87 deletions(-) diff --git a/redfam.py b/redfam.py index 048777a..1632841 100644 --- a/redfam.py +++ b/redfam.py @@ -126,8 +126,8 @@ class RedFamParser( RedFam ): __timestamp_format = jogobot.config['redundances']['timestamp_format'] # Define section heading re.pattern - __sectionhead_pat = re.compile( r"^(=+)(.*\[\[.+\]\].*\[\[.+\]\].*)\1" ) - + __sectionhead_pat = re.compile( r"^(.*\[\[.+\]\].*\[\[.+\]\].*)" ) + # Define timestamp re.pattern __timestamp_pat = re.compile( jogobot.config['redundances'] ['timestamp_regex'] ) @@ -333,8 +333,8 @@ class RedFamParser( RedFam ): @returns bool Returns True if it is a section heading """ - - if cls.__sectionhead_pat.search( line ): + + if cls.__sectionhead_pat.search( str(line) ): return True else: return False diff --git a/redpage.py b/redpage.py index 96a853d..a35cf9e 100644 --- a/redpage.py +++ b/redpage.py @@ -26,6 +26,7 @@ Provides a class for handling redundance discussion pages and archives """ import pywikibot # noqa +import mwparserfromhell as mwparser from mysqlred import MysqlRedPage from redfam import RedFamParser @@ -35,44 +36,44 @@ class RedPage: """ Class for handling redundance discussion pages and archives """ - + def __init__( self, page, archive=False ): """ Generate a new RedPage object based on the given pywikibot page object - + @param page page Pywikibot/MediaWiki page object for page """ - + # Safe the pywikibot page object self.page = page self._archive = archive - + self.__handle_db( ) self.is_page_changed() - + self._parsed = None if( self._changed or self.__mysql.data[ 'status' ] == 0 ): self.parse() - + self.__update_db() - 
+ def __handle_db( self ): """ Handles opening of db connection """ - + # We need a connection to our mysqldb self.__mysql = MysqlRedPage( self.page._pageid ) - + if not self.__mysql.data: self.__mysql.add_page( self.page.title(), self.page._revid ) - + def is_page_changed( self ): """ Check wether the page was changed since last run """ - + if( self.__mysql.data != { 'page_id': self.page._pageid, 'rev_id': self.page._revid, 'page_title': self.page.title(), @@ -85,102 +86,65 @@ class RedPage: """ Detects wether current page is an archive of discussions """ - + if( self._archive or ( u"/Archiv" in self.page.title() ) or ( "{{Archiv}}" in self.page.text ) or ( "{{Archiv|" in self.page.text ) ): - + return True else: return False - - def parse( self ): # noqa + + def parse( self ): """ Handles the parsing process """ - # Since @param text is a string we need to split it in lines - text_lines = self.page.text.split( "\n" ) - length = len( text_lines ) - - # Initialise line counter - i = 0 - fam_heading = None - beginning = None - ending = None - - # Set line for last detected Redundance-Family to 0 - last_fam = 0 - - # Iterate over the lines of the page - for line in text_lines: - - # Check wether we have an "Redundance-Family"-Section heading - if RedFamParser.is_sectionheading( line ): - - # Save line number for last detected Redundance-Family - last_fam = i - # Save heading - fam_heading = line - - # Defined (re)initialisation of dates - beginning = None - ending = None - - # Check wether we are currently in an "Redundance-Family"-Section - if i > last_fam and last_fam > 0: - - # Check if we have alredy recognized the beginning date of the - # discussion (in former iteration) or if we have a done-notice - if not beginning: - beginning = RedFamParser.is_beginning( line ) - elif not ending: - ending = RedFamParser.is_ending( line ) - - # Detect end of red_fam section (next line is new sectionheading) - # or end of file - # Prevent from running out of index - if i < 
(length - 1): - test = RedFamParser.is_sectionheading( text_lines[ i + 1 ] ) - else: - test = False - if ( test or ( length == ( i + 1 ) ) ): - - # Create the red_fam object - if( fam_heading and beginning ): - - # Maybe we can find a ending by feed if we have None yet - # (No done notice on archive pages) - if not ending and self.is_archive(): - j = i - while (j > last_fam) and not ending: - j -= 1 - ending = RedFamParser.is_ending2( text_lines[ j ] ) - - # Create the RedFam object - RedFamParser( fam_heading, self.page._pageid, - self.is_archive(), beginning, ending ) - - # Increment line counter - i += 1 + # Generate Wikicode object + self.wikicode = mwparser.parse( self.page.text ) + + # Select RedFam-sections + # matches=Regexp or + # function( gets heading content as wikicode as param 1) + # include_lead = if true include first section (intro) + # include_headings = if true include heading + fams = self.wikicode.get_sections( + matches=RedFamParser.is_sectionheading, + include_lead=False, include_headings=True ) + + # Iterate over RedFam + for fam in fams: + + # Extract heading text + heading = next( fam.ifilter_headings() ).title + + # Extract beginning and maybe ending + (beginning, ending) = RedFamParser.extract_dates( fam, + self.is_archive() + ) + + # Create the RedFam object + RedFamParser( heading, self.page._pageid, + self.is_archive(), beginning, ending ) + else: RedFamParser.flush_db_cache() self._parsed = True - + def __update_db( self ): """ Updates the page meta data in mysql db """ if( self._parsed or not self._changed ): status = 1 - + if( self.is_archive() ): status = 2 else: status = 0 - + self.__mysql.update_page( self.page._revid, self.page.title(), status ) - + @classmethod def flush_db_cache( cls ): """