From dcc485151392a9c05d75e6b845d5c7d3fd1044a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Wed, 24 Aug 2016 15:27:42 +0200 Subject: [PATCH] Check reddisc page titles against regex This prevents parsing pages that were wrongly placed in the configured categories or were passed via command-line parameters. Parsing such pages results in unexpected behaviour. Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75] --- reddiscparser.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/reddiscparser.py b/reddiscparser.py index 6525ac9..00329e4 100644 --- a/reddiscparser.py +++ b/reddiscparser.py @@ -27,6 +27,7 @@ Script to parse all reddisc pages in configured categories import os import sys +import re import pywikibot from pywikibot import pagegenerators @@ -46,6 +47,10 @@ class DiscussionParserBot( Botclass witch initialises the parsing process of Redundancy Discussions """ + # RegEx to filter wrong pages + onlyinclude_re = re.compile( + jogobot.config["redundances"]["reddiscs_onlyinclude_re"] ) + def __init__( self, generator ): """ Constructor @@ -86,6 +91,11 @@ class DiscussionParserBot( return + # Exclude pages which does not match pattern + if not type(self).onlyinclude_re.search( self.current_page.title() ): + + return + # Initiate RedPage object red_page = redpage.RedPage( self.current_page ) @@ -102,7 +112,7 @@ class DiscussionParserBot( # If successfully parsed whole page, flush # db write cache redfam.RedFamParser.flush_db_cache() - jogobot.output( "Page [[{redisc}]] parsed".format( + jogobot.output( "Page [[{reddisc}]] parsed".format( reddisc=red_page.page.title() ) )