You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

182 lines
5.8 KiB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# reddiscparser.py
#
# Copyright 2017 Jonathan Golder <jonathan@golderweb.de>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
"""
Bot to parse all reddisc pages in given Generator or configured categories
"""
import re
import pywikibot # noqa
from pywikibot import pagegenerators # noqa
from pywikibot.bot import ExistingPageBot, NoRedirectPageBot
import jogobot
from lib.redpage import RedPageParser
from lib.redfam import RedFamParser
class DiscussionParserBot(
        # CurrentPageBot,  # via next two sets 'current_page' on each treat()
        ExistingPageBot,  # CurrentPageBot only treats existing pages
        NoRedirectPageBot):  # class which only treats non-redirects
    """
    Bot class which initialises the parsing process of Redundancy Discussions

    For every page delivered by the generator, the matching RedPageParser
    record is looked up (or created) and, if parsing is needed, every
    section returned by it is handed over to RedFamParser.
    """

    # RegEx to filter wrong pages; compiled once at class creation time from
    # the jogobot configuration
    onlyinclude_re = re.compile(
        jogobot.config["redundances"]["reddiscs_onlyinclude_re"])

    def __init__(self, genFactory, **kwargs):
        """
        Constructor

        Parameters:
            @param genFactory GenFactory with parsed pagegenerator args to
                              build generator
            @type genFactory pagegenerators.GeneratorFactory
            @param **kwargs Additional args; accepted for interface
                            compatibility, currently not forwarded to the
                            super class constructor
            @type iterable
        """
        # Copy needed args
        self.genFactory = genFactory

        # Build generator with genFactory (sets self.gen)
        self.build_generator()

        # Run super class init with the built generator
        super().__init__(generator=self.gen)

    def build_generator(self):
        """
        Builds generator to work on, based on self.genFactory

        The result is stored in self.gen. If no generator could be built,
        the pywikibot help is shown and self.gen stays None.
        """
        # Make sure the attribute always exists, so the constructor does not
        # fail with an AttributeError when no generator is available
        self.gen = None

        # Check whether there are generators waiting for factoring, if not
        # use configured categories
        if not self.genFactory.gens:
            self.apply_conf_cat_generators()

        # Create combined Generator (Union of all Generators)
        combined = self.genFactory.getCombinedGenerator()

        if combined:
            # The preloading generator is responsible for downloading
            # multiple pages from the wiki simultaneously.
            self.gen = pagegenerators.PreloadingGenerator(combined)
        else:
            pywikibot.showHelp()

    def apply_conf_cat_generators(self):
        """
        Builds generators for categories which are read from jogobot.config

        Each generator that could be built is appended to the generator
        list of self.genFactory.
        """
        # Create Generators for configured Categories
        for category in jogobot.config["redundances"]["redpage_cats"]:
            gen = self.genFactory.getCategoryGen(
                category, gen_func=pagegenerators.CategorizedPageGenerator)

            # If there is one, append to genFactory
            if gen:
                self.genFactory.gens.append(gen)

    def run(self):
        """
        Controls the overall parsing process, using super class for page
        switch

        Needed to do things before/after treating pages is done
        """
        # Any exception raised by the super class run simply propagates, in
        # which case the flush below is skipped
        super().run()

        # If successfully parsed all pages in cat, flush db write cache
        RedPageParser.flush_db_cache()

    def treat_page(self):
        """
        Handles work on current page
        """
        title = self.current_page.title()

        # Short circuit excluded pages
        if title in jogobot.config["redundances"]["redpage_exclude"]:
            return

        # Exclude pages which do not match pattern
        if not type(self).onlyinclude_re.search(title):
            return

        # Initiate RedPage object: reuse the stored record with the same
        # pageid if there is one, otherwise create a new one
        redpage = RedPageParser.session.query(RedPageParser).filter(
            RedPageParser.pageid == self.current_page.pageid).one_or_none()

        if redpage:
            redpage.update(self.current_page)
        else:
            redpage = RedPageParser(self.current_page)

        # Check whether parsing is needed
        if not redpage.is_parsing_needed():
            return

        # Count families for failure analysis
        fam_counter = 0

        # Iterate over returned generator with redfam sections
        for fam_counter, fam in enumerate(redpage.parse(), start=1):
            # Run RedFamParser on section text
            RedFamParser.parser(fam, redpage, redpage.archive)

        # Whole page parsed without an exception
        if fam_counter:
            # At least one family was found, flush db write cache
            RedFamParser.flush_db_cache()
            jogobot.output("Page [[{reddisc}]] parsed".format(
                reddisc=redpage.page.title()))
        else:
            # Nothing to flush, but leave a trace for failure analysis.
            # Only the middle literal is formatted, "{red}" is kept verbatim.
            jogobot.output(
                "\03{red}" + "Page [[{reddisc}]], ".format(
                    reddisc=redpage.page.title()) +
                "containing no redfam, parsed!",
                "WARNING")