Files
jogobot-red/reddiscparser.py
GOLDERWEB – Jonathan Golder a8605bcee6 Mv pages-parser.py to reddiscparser.py
New, more meaningful naming conventions, from redpage to reddisc (page)

Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=72 FS#72]
2016-08-23 21:50:22 +02:00

108 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# reddiscparser.py
#
# Copyright 2016 GOLDERWEB Jonathan Golder <jonathan@golderweb.de>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
"""
Script to parse all redpages in configured categories
"""
import pywikibot
from pywikibot import pagegenerators
import jogobot
import redpage
import redfam
def get_cat_pages( cat ):
    """
    Build an iterable generator object over all pages listed in the given
    category

    @param cat Category to request
    @type cat str
    @returns generator Iterable object with pages of given category
    """
    # The site to work on is taken from the pywikibot config; the category
    # is resolved against that site
    wiki_category = pywikibot.Category( pywikibot.Site(), cat )

    # Hand back the generator of page objects for that category directly
    return pagegenerators.CategorizedPageGenerator( wiki_category )
def main(*args):
    """
    Handles process: parses all red pages in the configured categories

    Iterates over every category in
    jogobot.config["redundances"]["redpage_cats"], skips pages whose title
    is listed in jogobot.config["redundances"]["redpage_exclude"] and, for
    each remaining page that needs parsing, passes every redfam section to
    redfam.RedFamParser.parser. The RedFamParser db write cache is flushed
    after each parsed page, the RedPage db write cache after each category.
    The end message is written and pywikibot.stopme() is called in any
    case, even if an exception aborts the run (the exception propagates).

    @param args Positional arguments, accepted but not used
    """
    try:
        jogobot.output( "BEGINN parser-pages.py" )

        # Config lookups do not change while running, so do them once
        # instead of once per page
        redundances = jogobot.config["redundances"]
        categories = redundances["redpage_cats"]
        excluded_titles = redundances["redpage_exclude"]

        # Iterate over configured categories
        for cat in categories:

            # Iterate over pages in current cat
            for page in get_cat_pages( cat ):

                # For pages configured to exclude, go on with next page
                if page.title() in excluded_titles:
                    continue

                # Initiate RedPage object
                red_page = redpage.RedPage( page )

                # Nothing to do for pages which do not need parsing
                if not red_page.is_parsing_needed():
                    continue

                # Iterate over returned generator with redfam sections
                for fam in red_page.parse():

                    # Run RedFamParser on section text.
                    # Use the public pageid property rather than the
                    # private _pageid attribute, which only exists once
                    # the page info has been loaded
                    # NOTE(review): assumes red_page.page is a pywikibot
                    # page object - confirm against RedPage
                    redfam.RedFamParser.parser( fam, red_page.page.pageid,
                                                red_page.is_archive() )

                # Whole page parsed without an exception (the loop has no
                # break, so this is reached whenever it finishes):
                # flush db write cache
                redfam.RedFamParser.flush_db_cache()
                jogobot.output( "Page '%s' parsed" %
                                red_page.page.title() )

            # All pages in cat processed without an exception:
            # flush db write cache
            redpage.RedPage.flush_db_cache()

    finally:
        # Always report the end and let pywikibot shut down
        jogobot.output( "END parser-pages.py" )
        pywikibot.stopme()
# Start the parser only when this file is executed as a script
if __name__ == "__main__":
    main()