Prevents the parsing of pages which have been wrongly categorized in the configured categories or which are given via command-line parameters. Parsing such pages results in unexpected behaviour. Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=75 FS#75]
218 lines
6.7 KiB
Python
218 lines
6.7 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
#
|
||
# reddiscparser.py
|
||
#
|
||
# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
|
||
#
|
||
# This program is free software; you can redistribute it and/or modify
|
||
# it under the terms of the GNU General Public License as published by
|
||
# the Free Software Foundation; either version 2 of the License, or
|
||
# (at your option) any later version.
|
||
#
|
||
# This program is distributed in the hope that it will be useful,
|
||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
# GNU General Public License for more details.
|
||
#
|
||
# You should have received a copy of the GNU General Public License
|
||
# along with this program; if not, write to the Free Software
|
||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||
# MA 02110-1301, USA.
|
||
#
|
||
#
|
||
"""
|
||
Script to parse all reddisc pages in configured categories
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import re
|
||
|
||
import pywikibot
|
||
from pywikibot import pagegenerators
|
||
from pywikibot.bot import ExistingPageBot, NoRedirectPageBot
|
||
|
||
import jogobot
|
||
|
||
import redpage
|
||
import redfam
|
||
|
||
|
||
class DiscussionParserBot(
        # CurrentPageBot,  # via next two sets 'current_page' on each treat()
        ExistingPageBot,  # CurrentPageBot only treats existing pages
        NoRedirectPageBot ):  # class which only treats non-redirects
    """
    Bot class which initialises the parsing process of Redundancy Discussions

    Each page delivered by the generator is checked against the configured
    exclude list and the configured include pattern before it is handed to
    redpage.RedPage and redfam.RedFamParser.
    """

    # RegEx to filter wrong pages; compiled once when the class is created
    # from the pattern stored in the jogobot configuration
    onlyinclude_re = re.compile(
        jogobot.config["redundances"]["reddiscs_onlyinclude_re"] )

    def __init__( self, generator ):
        """
        Constructor

        Parameters:
            @param generator: The page generator that determines on which pages
                              to work.
            @type generator: generator.
        """
        super().__init__(generator=generator)

    def run( self ):
        """
        Controls the overall parsing process, using super class for page switch

        Needed to do things before/after treating pages is done. The db write
        cache of RedPage is flushed only when the run() of the super class
        returned without raising; any exception propagates unchanged.
        """
        super().run()

        # Only reached if run() above did not raise:
        # all pages were treated, flush db write cache
        redpage.RedPage.flush_db_cache()

    def treat_page( self ):
        """
        Handles work on current page

        Returns early (without parsing) for pages listed in the configured
        exclude list, for pages whose title does not match onlyinclude_re
        and for pages for which RedPage reports that no parsing is needed.
        """
        title = self.current_page.title()

        # Short circuit excluded pages
        if title in jogobot.config["redundances"]["redpage_exclude"]:
            return

        # Exclude pages which do not match pattern
        if not type(self).onlyinclude_re.search( title ):
            return

        # Initiate RedPage object
        red_page = redpage.RedPage( self.current_page )

        # Check whether parsing is needed
        if not red_page.is_parsing_needed():
            return

        # Iterate over returned generator with redfam sections
        for fam in red_page.parse():

            # Run RedFamParser on section text
            # NOTE(review): reads the private attribute _pageid of the page
            # object - kept as is, confirm whether a public accessor fits
            redfam.RedFamParser.parser( fam, red_page.page._pageid,
                                        red_page.is_archive() )

        # Loop finished without an exception, so the whole page was parsed:
        # flush db write cache and report the page
        redfam.RedFamParser.flush_db_cache()
        jogobot.output( "Page [[{reddisc}]] parsed".format(
            reddisc=red_page.page.title() ) )
|
||
|
||
|
||
def apply_conf_cat_generators( genFactory ):
    """
    Adds one category generator per configured category to genFactory

    The category names are read from jogobot.config (section "redundances",
    key "redpage_cats").

    Parameters:
        @param genFactory: The GeneratorFactory to which the built generators
                           should be added.
        @type genFactory: pagegenerators.GeneratorFactory
    """
    configured_cats = jogobot.config["redundances"]["redpage_cats"]

    for cat_name in configured_cats:

        # Let the factory build the generator for this category
        cat_gen = genFactory.getCategoryGen(
            cat_name, gen_func=pagegenerators.CategorizedPageGenerator)

        # Nothing usable returned for this category, try the next one
        if not cat_gen:
            continue

        genFactory.gens.append(cat_gen)
|
||
|
||
|
||
def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    Arguments which are not consumed by pywikibot.handle_args are passed to
    a pagegenerators.GeneratorFactory (except those starting with "-always",
    which are accepted but ignored). If that leaves the factory without any
    generator, the categories configured in jogobot.config are used. If no
    combined generator can be built, the pywikibot help is shown.

    @param args: command line arguments
    @type args: list of unicode
    """

    # Process global arguments to determine desired site
    local_args = pywikibot.handle_args(args)

    # Get the jogobot-task_slug (basename of current file without ending)
    task_slug = os.path.splitext(os.path.basename(__file__))[0]

    # Before run, we need to check whether we are currently active or not
    try:
        # Will throw Exception if disabled/blocked
        # NOTE: the check is currently disabled, therefore the two handlers
        # below can not be reached at the moment
        # jogobot.is_active( task_slug )
        pass

    except jogobot.jogobot.Blocked as err:
        jogobot.output( "\03{lightpurple} %s (%s)" % (err, type(err)),
                        "CRITICAL" )

    except jogobot.jogobot.Disabled as err:
        jogobot.output( "\03{red} %s (%s)" % (err, type(err)),
                        "ERROR" )

    # Bot/Task is active
    else:

        # This factory is responsible for processing command line arguments
        # that are also used by other scripts and that determine on which pages
        # to work on.
        genFactory = pagegenerators.GeneratorFactory()

        # Parse command line arguments
        for arg in local_args:
            # "-always" is accepted for compatibility but has no effect here
            if not arg.startswith("-always"):
                genFactory.handleArg(arg)

        # Check whether there are generators waiting for factoring, if not
        # use configured categories
        if not genFactory.gens:
            apply_conf_cat_generators( genFactory )

        # Create combined Generator (Union of all Generators); this is the
        # generator which gives the pages that should be worked upon
        gen = genFactory.getCombinedGenerator()

        if gen:
            # Log beginning of parsing
            jogobot.output( "{task_slug} invoked".format(task_slug=task_slug) )

            # The preloading generator is responsible for downloading multiple
            # pages from the wiki simultaneously.
            gen = pagegenerators.PreloadingGenerator(gen)
            DiscussionParserBot( gen ).run()
        else:
            pywikibot.showHelp()
|
||
|
||
# Run the bot only when this file is executed as a script
if __name__ == "__main__":
    main()
|