jogobot-red/reddiscparser.py

#!/usr/bin/env python3
# -*- coding: utf-8  -*-
#
#  parse-pages.py
#
#  Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#
"""
Script to parse all redpages in configured categories
"""

import pywikibot
from pywikibot import pagegenerators

import jogobot

import redpage
import redfam


def get_cat_pages( cat ):
    """
    Builds an iterable generator object yielding all pages listed in the
    given category

    The site to work on is the one returned by pywikibot.Site() without
    arguments, i.e. the one taken from the pywikibot configuration.

    @param  cat  Category to request
    @type  cat  str

    @returns  generator  Iterable object with pages of given category
    """

    # Site to work on, as set up in the pywikibot config
    wiki = pywikibot.Site()

    # Wrap the category object directly into a page generator
    return pagegenerators.CategorizedPageGenerator(
        pywikibot.Category( wiki, cat ) )


def main(*args):
    """
    Parses all redpages found in the configured categories

    Walks through the categories listed in
    jogobot.config["redundances"]["redpage_cats"], skips every page whose
    title is listed in jogobot.config["redundances"]["redpage_exclude"] and
    hands each section yielded by RedPage.parse() over to
    RedFamParser.parser() for all pages which report that parsing is needed.

    The RedFamParser db write cache is flushed after each parsed page, the
    RedPage one after each category. The end message is printed and
    pywikibot.stopme() is called in any case, also if an exception occurs.

    @param  args  Positional arguments, not used
    """

    try:
        jogobot.output( "BEGINN – parser-pages.py" )

        # Walk through every category named in the configuration
        for cat in jogobot.config["redundances"]["redpage_cats"]:

            # Walk through every page of the current category
            for page in get_cat_pages( cat ):

                # Pages configured as excluded are left alone
                title = page.title()
                if title in jogobot.config["redundances"]["redpage_exclude"]:
                    continue

                # Wrap the page into a RedPage object
                red_page = redpage.RedPage( page )

                # Nothing to do if the page does not need to be parsed
                if not red_page.is_parsing_needed():
                    continue

                # Feed each redfam section of the page to RedFamParser
                for fam in red_page.parse():
                    redfam.RedFamParser.parser(
                        fam,
                        red_page.page._pageid,
                        red_page.is_archive() )

                # Only reached if no exception interrupted the page:
                # flush db write cache and report
                redfam.RedFamParser.flush_db_cache()
                jogobot.output(
                    "Page '%s' parsed" % red_page.page.title() )

            # Only reached if no exception interrupted the category:
            # flush db write cache
            redpage.RedPage.flush_db_cache()

    finally:
        jogobot.output( "END – parser-pages.py" )
        pywikibot.stopme()

# Start processing only if executed as a script, not when imported
if __name__ == "__main__":
    main()