|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
#
|
|
|
|
# reddiscparser.py
|
|
|
|
#
|
|
|
|
# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
|
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program; if not, write to the Free Software
|
|
|
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
|
|
# MA 02110-1301, USA.
|
|
|
|
#
|
|
|
|
#
|
|
|
|
"""
|
|
|
|
Bot to parse all reddisc pages in given Generator or configured categories
|
|
|
|
"""
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
import pywikibot # noqa
|
|
|
|
from pywikibot import pagegenerators # noqa
|
|
|
|
from pywikibot.bot import ExistingPageBot, NoRedirectPageBot
|
|
|
|
|
|
|
|
import jogobot
|
|
|
|
|
|
|
|
from lib import redpage
|
|
|
|
from lib import redfam
|
|
|
|
|
|
|
|
|
|
|
|
class DiscussionParserBot(
        # CurrentPageBot,  # via next two sets 'current_page' on each treat()
        ExistingPageBot,    # CurrentPageBot only treats existing pages
        NoRedirectPageBot ):  # class which only treats non-redirects
    """
    Bot class which initialises the parsing process of Redundancy Discussions

    For every page delivered by the generator, treat_page() skips titles
    listed in the "redpage_exclude" config entry as well as titles which do
    not match onlyinclude_re, and hands the remaining pages over to
    redpage.RedPage / redfam.RedFamParser.
    """

    # RegEx to filter wrong pages (pattern is read from jogobot.config once,
    # when the class body is evaluated)
    onlyinclude_re = re.compile(
        jogobot.config["redundances"]["reddiscs_onlyinclude_re"] )

    def __init__( self, generator ):
        """
        Constructor

        Parameters:
            @param generator: The page generator that determines on which pages
                              to work.
            @type generator: generator.
        """
        # Let the pywikibot base classes set up their internal state first
        super( DiscussionParserBot, self ).__init__()

        # Assign the generator to the bot. Without this the argument was
        # silently dropped and run() had nothing to iterate over.
        self.generator = generator

    def build_generator(self):
        """
        Builds generator to work on, based on self.genFactory

        The result is stored in self.gen. If the factory yields no combined
        generator, the pywikibot help is shown and self.gen is left untouched.

        NOTE(review): self.genFactory is not assigned anywhere in this class;
        it has to be set on the instance before this method is called.
        """
        # Check whether there are generators waiting for factoring, if not
        # use configured categories
        if not self.genFactory.gens:
            self.apply_conf_cat_generators()

        # Create combined Generator (Union of all Generators)
        gen = self.genFactory.getCombinedGenerator()

        if gen:
            # The preloading generator is responsible for downloading multiple
            # pages from the wiki simultaneously.
            self.gen = pagegenerators.PreloadingGenerator(gen)
        else:
            pywikibot.showHelp()

    def apply_conf_cat_generators( self ):
        """
        Builds generators for categories which are read from jogobot.config

        One category generator is requested from self.genFactory for each
        entry of the "redpage_cats" config list; every generator obtained
        that way is appended to self.genFactory.gens.
        """
        # Create Generators for configured Categories
        for category in jogobot.config["redundances"]["redpage_cats"]:
            gen = self.genFactory.getCategoryGen(
                category, gen_func=pagegenerators.CategorizedPageGenerator)

            # If there is one, append to genFactory
            if gen:
                self.genFactory.gens.append(gen)

    def run( self ):
        """
        Controls the overall parsing process, using super class for page switch

        Needed to do things before/after treating pages is done
        """
        # Any exception raised by the super class propagates unchanged, so
        # the flush below is only reached if run() returned normally.
        super( DiscussionParserBot, self ).run()

        # If successfully parsed all pages in cat, flush db write cache
        redpage.RedPage.flush_db_cache()

    def treat_page( self ):
        """
        Handles work on current page

        Returns early, without doing anything, if the page title is excluded
        by config, does not match onlyinclude_re, or if the RedPage object
        reports that no parsing is needed.
        """
        title = self.current_page.title()

        # Short circuit excluded pages
        if title in jogobot.config["redundances"]["redpage_exclude"]:
            return

        # Exclude pages which do not match pattern
        if not type(self).onlyinclude_re.search( title ):
            return

        # Initiate RedPage object
        red_page = redpage.RedPage( self.current_page )

        # Check whether parsing is needed
        if not red_page.is_parsing_needed():
            return

        # Count families for failure analysis
        fam_counter = 0

        # Iterate over returned generator with redfam sections
        for fam in red_page.parse():

            # Run RedFamParser on section text
            redfam.RedFamParser.parser( fam, red_page.page,
                                        red_page.is_archive() )

            fam_counter += 1

        # Only reached if the whole page was parsed without an exception
        # (the loop contains no break, so no for/else is needed here).
        if fam_counter:
            # At least one family was parsed: flush db write cache
            redfam.RedFamParser.flush_db_cache()
            jogobot.output( "Page [[{reddisc}]] parsed".format(
                reddisc=red_page.page.title() ) )
        else:
            # Page needed parsing but contained no family: emit a warning
            jogobot.output(
                "\03{red}" + "Page [[{reddisc}]], ".format(
                    reddisc=red_page.page.title() ) +
                "containing no redfam, parsed!",
                "WARNING" )
|