Add parse-pages.py Script
This commit is contained in:
107
parse-pages.py
Normal file
107
parse-pages.py
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# parse-pages.py
|
||||
#
|
||||
# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||||
# MA 02110-1301, USA.
|
||||
#
|
||||
#
|
||||
"""
|
||||
Script to parse all redpages in configured categories
|
||||
"""
|
||||
|
||||
import pywikibot
|
||||
from pywikibot import pagegenerators
|
||||
|
||||
import jogobot
|
||||
|
||||
import redpage
|
||||
import redfam
|
||||
|
||||
|
||||
def get_cat_pages(cat):
    """
    Build an iterable generator object over all pages listed in the given
    category

    @param cat      Category to request
    @type  cat      str

    @returns generator  Iterable object with the pages of the given category
    """

    # The site to work on comes from the pywikibot configuration
    wiki = pywikibot.Site()

    # Wrap the category in a page generator and hand that to the caller
    return pagegenerators.CategorizedPageGenerator(
        pywikibot.Category(wiki, cat))
|
||||
|
||||
|
||||
def main(*args):
    """
    Drive one parsing run over all configured redpage categories

    For every category in jogobot.config["redundances"]["redpage_cats"] the
    pages returned by get_cat_pages() are visited.  Pages whose title is
    listed in jogobot.config["redundances"]["redpage_exclude"] are skipped.
    For each remaining page a RedPage is created; if it reports that parsing
    is needed, every section returned by RedPage.parse() is passed to
    redfam.RedFamParser.parser() and the RedFamParser db cache is flushed
    afterwards.  After each category the RedPage db cache is flushed.

    A begin and an end message are written via jogobot.output(); the end
    message and pywikibot.stopme() run in a finally clause, i.e. also when
    an exception aborts the run.

    @param args     Positional arguments, accepted but not used here
    """

    try:
        # NOTE(review): the message says "parser-pages.py" while the file
        # header names the script "parse-pages.py" - confirm which is meant
        jogobot.output("BEGINN – parser-pages.py")

        # Walk through the configured categories
        for cat in jogobot.config["redundances"]["redpage_cats"]:

            # Walk through the pages of the current category
            for page in get_cat_pages(cat):

                # Pages configured as excluded are skipped entirely
                title = page.title()
                if title in jogobot.config["redundances"]["redpage_exclude"]:
                    continue

                # Wrap the page in a RedPage object
                red_page = redpage.RedPage(page)

                # Nothing to do if the page does not need parsing
                if not red_page.is_parsing_needed():
                    continue

                # Run RedFamParser on every redfam section of the page
                # NOTE(review): reads the private attribute _pageid of the
                # wrapped page object - confirm no public accessor is meant
                for fam in red_page.parse():
                    redfam.RedFamParser.parser(
                        fam, red_page.page._pageid, red_page.is_archive())

                # Only reached when the whole page was parsed without an
                # exception, so the db write cache can be flushed
                redfam.RedFamParser.flush_db_cache()
                jogobot.output("Page '%s' parsed" % red_page.page.title())

            # Only reached when all pages of the category were processed
            # without an exception, so the db write cache can be flushed
            redpage.RedPage.flush_db_cache()

    finally:
        jogobot.output("END – parser-pages.py")
        pywikibot.stopme()
|
||||
|
||||
# Start the parsing run only when executed as a script, not on import
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user