Add parse-pages.py Script
This commit is contained in:
107
parse-pages.py
Normal file
107
parse-pages.py
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# parse-pages.py
|
||||||
|
#
|
||||||
|
# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||||||
|
# MA 02110-1301, USA.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
"""
|
||||||
|
Script to parse all redpages in configured categories
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pywikibot
|
||||||
|
from pywikibot import pagegenerators
|
||||||
|
|
||||||
|
import jogobot
|
||||||
|
|
||||||
|
import redpage
|
||||||
|
import redfam
|
||||||
|
|
||||||
|
|
||||||
|
def get_cat_pages( cat ):
    """
    Build an iterable generator over all pages contained in the given
    category.

    @param cat Category to request
    @type cat str

    @returns generator Iterable object yielding the pages of given category
    """

    # The site to work on comes from the pywikibot user configuration
    wiki = pywikibot.Site()

    # Bind the requested category name to a Category object on that site
    red_cat = pywikibot.Category( wiki, cat )

    # Hand back a lazy generator that yields the category's member pages
    return pagegenerators.CategorizedPageGenerator( red_cat )
|
||||||
|
|
||||||
|
|
||||||
|
def main(*args):
    """
    Run one parsing pass over all configured redundance categories.

    Iterates over every category listed in
    jogobot.config["redundances"]["redpage_cats"], skips pages named in
    ...["redpage_exclude"], parses each remaining page via redpage/redfam
    and flushes the db write caches after each fully parsed page resp.
    after each fully processed category.

    @param args Ignored; accepted for pywikibot-style script entry points
    """

    try:
        jogobot.output( "BEGINN – parser-pages.py" )

        # Iterate over configured categories
        for cat in jogobot.config["redundances"]["redpage_cats"]:

            # Iterate over pages in current cat
            for page in get_cat_pages( cat ):

                # For pages configured to exclude, go on with next page
                if page.title() in (
                        jogobot.config["redundances"]["redpage_exclude"] ):
                    continue

                # Initiate RedPage object
                red_page = redpage.RedPage( page )

                # Check whether parsing is needed
                if red_page.is_parsing_needed():

                    # Iterate over returned generator with redfam sections
                    for fam in red_page.parse():

                        # Run RedFamParser on section text
                        # NOTE(review): _pageid is a private pywikibot
                        # attribute — confirm whether the public
                        # page.pageid accessor can be used instead
                        redfam.RedFamParser.parser( fam,
                                                    red_page.page._pageid,
                                                    red_page.is_archive() )

                    # Original code used "for … else" here; without any
                    # break the else-branch always ran on normal loop
                    # completion, so plain post-loop code is equivalent
                    # and clearer.
                    # If successfully parsed whole page, flush
                    # db write cache
                    redfam.RedFamParser.flush_db_cache()
                    jogobot.output( "Page '%s' parsed" %
                                    red_page.page.title() )

            # Same for-else simplification as above:
            # if successfully parsed all pages in cat, flush db write cache
            redpage.RedPage.flush_db_cache()

    finally:
        # Always log termination and shut pywikibot down cleanly,
        # even when parsing raised an exception
        jogobot.output( "END – parser-pages.py" )
        pywikibot.stopme()
|
||||||
|
|
||||||
|
# Run the parser only when executed as a script, not on import
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user