Files
jogobot-red/bots/reddiscparser.py
Jonathan Golder 6e973369cd sqlalchemy working for parser
Needs some testing, presumably contains some bugs
2017-03-09 00:08:48 +01:00

183 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# reddiscparser.py
#
# Copyright 2016 GOLDERWEB Jonathan Golder <jonathan@golderweb.de>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
"""
Bot to parse all reddisc pages in given Generator or configured categories
"""
import re
import pywikibot # noqa
from pywikibot import pagegenerators # noqa
from pywikibot.bot import ExistingPageBot, NoRedirectPageBot
import jogobot
from lib.redpage import RedPage
from lib.redfam import RedFamParser
class DiscussionParserBot(
        # CurrentPageBot,  # via next two sets 'current_page' on each treat()
        ExistingPageBot,  # CurrentPageBot only treats existing pages
        NoRedirectPageBot):  # class which only treats non-redirects
    """
    Bot class which initialises the parsing process of Redundancy Discussions
    """

    # RegEx to filter wrong pages, pattern is read from jogobot.config
    onlyinclude_re = re.compile(
        jogobot.config["redundances"]["reddiscs_onlyinclude_re"])

    def __init__(self, genFactory, **kwargs):
        """
        Constructor

        Parameters:
            @param genFactory GenFactory with parsed pagegenerator args to
                              build generator
            @type  genFactory pagegenerators.GeneratorFactory
            @param **kwargs   Additional args (accepted, but currently not
                              forwarded to the super class constructor)
            @type             iterable
        """
        # Copy needed args
        self.genFactory = genFactory

        # Build generator with genFactory (always sets self.gen)
        self.build_generator()

        # Run super class init with the built generator
        super(DiscussionParserBot, self).__init__(generator=self.gen)

    def build_generator(self):
        """
        Builds generator to work on, based on self.genFactory

        The result is stored in self.gen. If the factory cannot provide any
        generator, the help is shown and self.gen becomes an empty iterator.
        """
        # Check whether there are generators waiting for factoring, if not
        # use configured categories
        if not self.genFactory.gens:
            self.apply_conf_cat_generators()

        # Create combined Generator (Union of all Generators)
        gen = self.genFactory.getCombinedGenerator()

        if gen:
            # The preloading generator is responsible for downloading multiple
            # pages from the wiki simultaneously.
            self.gen = pagegenerators.PreloadingGenerator(gen)
        else:
            pywikibot.showHelp()
            # Without this fallback self.gen would stay undefined and
            # __init__ would fail with an AttributeError right after the help
            # has been shown
            self.gen = iter(())

    def apply_conf_cat_generators(self):
        """
        Builds generators for categories which are read from jogobot.config
        and appends them to the generator list of self.genFactory
        """
        # Create Generators for configured Categories
        for category in jogobot.config["redundances"]["redpage_cats"]:
            gen = self.genFactory.getCategoryGen(
                category, gen_func=pagegenerators.CategorizedPageGenerator)

            # If there is one, append to genFactory
            if gen:
                self.genFactory.gens.append(gen)

    def run(self):
        """
        Controls the overall parsing process, using super class for page switch

        Needed to do things before/after treating pages is done
        """
        # Any exception raised by the super class run propagates unchanged,
        # in that case the flush below is skipped
        super(DiscussionParserBot, self).run()

        # If successfully parsed all pages in cat, flush db write cache
        RedPage.flush_db_cache()

    def treat_page(self):
        """
        Handles work on current page
        """
        title = self.current_page.title()

        # Short circuit excluded pages
        if title in jogobot.config["redundances"]["redpage_exclude"]:
            return

        # Exclude pages which do not match pattern
        if not type(self).onlyinclude_re.search(title):
            return

        # Look up an already stored RedPage by page id, otherwise create one
        redpage = RedPage.session.query(RedPage).filter(
            RedPage.pageid == self.current_page.pageid).one_or_none()

        if redpage:
            redpage.update(self.current_page)
        else:
            redpage = RedPage(self.current_page)

        # Check whether parsing is needed
        if not redpage.is_parsing_needed():
            return

        # Count families for failure analysis
        fam_counter = 0

        # Iterate over returned generator with redfam sections
        for fam in redpage.parse():
            # Run RedFamParser on section text
            RedFamParser.parser(fam, redpage, redpage.is_archive())
            fam_counter += 1

        # Only reached if the whole page was parsed without an exception
        if fam_counter:
            # Flush db write cache
            RedFamParser.flush_db_cache()
            jogobot.output("Page [[{reddisc}]] parsed".format(
                reddisc=redpage.page.title()))
        else:
            jogobot.output(
                "\03{red}" + "Page [[{reddisc}]], ".format(
                    reddisc=redpage.page.title()) +
                "containing no redfam, parsed!",
                "WARNING")