Add parse-pages.py Script
This commit is contained in:
107
parse-pages.py
Normal file
107
parse-pages.py
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# parse-pages.py
|
||||||
|
#
|
||||||
|
# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||||||
|
# MA 02110-1301, USA.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
"""
|
||||||
|
Script to parse all redpages in configured categories
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pywikibot
|
||||||
|
from pywikibot import pagegenerators
|
||||||
|
|
||||||
|
import jogobot
|
||||||
|
|
||||||
|
import redpage
|
||||||
|
import redfam
|
||||||
|
|
||||||
|
|
||||||
|
def get_cat_pages( cat ):
    """
    Build an iterable generator over all pages contained in the given
    category.

    @param cat Category to request
    @type cat str

    @returns generator Iterable object yielding the pages of given category
    """

    # The site to work on comes from the pywikibot user configuration
    wiki = pywikibot.Site()

    # Bind the requested category name to a Category object on that site
    red_cat = pywikibot.Category( wiki, cat )

    # Hand back a lazy generator that yields the category's member pages
    return pagegenerators.CategorizedPageGenerator( red_cat )
|
||||||
|
|
||||||
|
|
||||||
|
def main(*args):
    """
    Run one parsing pass over all configured redundance categories.

    Iterates over every category listed in
    jogobot.config["redundances"]["redpage_cats"], skips pages named in
    ...["redpage_exclude"], parses each remaining page via redpage/redfam
    and flushes the db write caches after each fully parsed page resp.
    after each fully processed category.

    @param args Ignored; accepted for pywikibot-style script entry points
    """

    try:
        jogobot.output( "BEGINN – parser-pages.py" )

        # Iterate over configured categories
        for cat in jogobot.config["redundances"]["redpage_cats"]:

            # Iterate over pages in current cat
            for page in get_cat_pages( cat ):

                # For pages configured to exclude, go on with next page
                if page.title() in (
                        jogobot.config["redundances"]["redpage_exclude"] ):
                    continue

                # Initiate RedPage object
                red_page = redpage.RedPage( page )

                # Check whether parsing is needed
                if red_page.is_parsing_needed():

                    # Iterate over returned generator with redfam sections
                    for fam in red_page.parse():

                        # Run RedFamParser on section text
                        # NOTE(review): _pageid is a private pywikibot
                        # attribute — confirm whether the public
                        # page.pageid accessor can be used instead
                        redfam.RedFamParser.parser( fam,
                                                    red_page.page._pageid,
                                                    red_page.is_archive() )

                    # Original code used "for … else" here; without any
                    # break the else-branch always ran on normal loop
                    # completion, so plain post-loop code is equivalent
                    # and clearer.
                    # If successfully parsed whole page, flush
                    # db write cache
                    redfam.RedFamParser.flush_db_cache()
                    jogobot.output( "Page '%s' parsed" %
                                    red_page.page.title() )

            # Same for-else simplification as above:
            # if successfully parsed all pages in cat, flush db write cache
            redpage.RedPage.flush_db_cache()

    finally:
        # Always log termination and shut pywikibot down cleanly,
        # even when parsing raised an exception
        jogobot.output( "END – parser-pages.py" )
        pywikibot.stopme()
|
||||||
|
|
||||||
|
# Run the parser only when executed as a script, not on import
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user