Add parse-pages.py Script

8 years ago · a24f208449
2 changed files with 107 additions and 2 deletions
--- a/parse-pages.py
+++ b/parse-pages.py
@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8  -*-
+#
+#  parse-pages.py
+#
+#  Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+#  MA 02110-1301, USA.
+#
+#
+"""
+Script to parse all redpages in configured categories
+"""
+
+import pywikibot
+from pywikibot import pagegenerators
+
+import jogobot
+
+import redpage
+import redfam
+
+
+def get_cat_pages( cat ):
+    """
+    Build a generator yielding every page listed in the given category
+
+    The site to work on is taken from the pywikibot configuration, the
+    category is resolved on that site.
+
+    @param  cat  Category to request
+    @type  cat  str
+
+    @returns  generator  Iterable object with pages of given category
+    """
+
+    # The site to work on is defined by the pywikibot config
+    wiki = pywikibot.Site()
+
+    # Resolve the requested category on that site and hand the generator
+    # with its page objects straight back to the caller
+    return pagegenerators.CategorizedPageGenerator(
+        pywikibot.Category( wiki, cat )
+    )
+
+
+def main(*args):
+    """
+    Parse the redpages of all configured categories
+
+    Pages listed in the redpage_exclude config are skipped. The "END"
+    output and pywikibot.stopme() run even if an error occurred.
+
+    @param  args  Positional arguments, accepted but not used
+    """
+
+    try:
+        jogobot.output( "BEGINN – parse-pages.py" )
+
+        # Config section is the same for the whole run, look it up once
+        config = jogobot.config["redundances"]
+
+        # Iterate over configured categories
+        for cat in config["redpage_cats"]:
+            # Iterate over pages in current cat
+            for page in get_cat_pages( cat ):
+
+                # For pages configured to exclude, go on with next page
+                if page.title() in config["redpage_exclude"]:
+                    continue
+
+                # Nothing to do for pages which do not need parsing
+                red_page = redpage.RedPage( page )
+                if not red_page.is_parsing_needed():
+                    continue
+
+                # Run RedFamParser on each redfam section of the page
+                for fam in red_page.parse():
+                    redfam.RedFamParser.parser( fam, red_page.page._pageid,
+                                                red_page.is_archive() )
+
+                # Whole page parsed without error, flush db write cache
+                redfam.RedFamParser.flush_db_cache()
+                jogobot.output( "Page '%s' parsed" % red_page.page.title() )
+
+            # All pages of this cat are done, flush db write cache
+            redpage.RedPage.flush_db_cache()
+
+    finally:
+        jogobot.output( "END – parse-pages.py" )
+        pywikibot.stopme()
+
+if __name__ == "__main__":
+    main()  # Only start parsing when executed as a script
--- a/tox.ini
+++ b/tox.ini
@ -1,2 +0,0 @@
-[flake8]
-ignore = E129,E201,E202,W293