You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
107 lines
3.2 KiB
107 lines
3.2 KiB
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# parse-pages.py
|
|
#
|
|
# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
# MA 02110-1301, USA.
|
|
#
|
|
#
|
|
"""
|
|
Script to parse all redpages in configured categories
|
|
"""
|
|
|
|
import pywikibot
|
|
from pywikibot import pagegenerators
|
|
|
|
import jogobot
|
|
|
|
import redpage
|
|
import redfam
|
|
|
|
|
|
def get_cat_pages(cat):
    """
    Build an iterable generator object over all pages listed in the given
    category

    @param cat    Category to request
    @type  cat    str

    @returns generator    Iterable object with the pages of given category
    """

    # The site to work on comes from the pywikibot config; the category is
    # bound to that site and handed straight to the page generator.
    return pagegenerators.CategorizedPageGenerator(
        pywikibot.Category(pywikibot.Site(), cat)
    )
|
|
|
|
|
|
def main(*args):
    """
    Handles the whole parsing process

    Walks over every category configured in the jogobot config under
    redundances -> redpage_cats, skips pages whose title is listed under
    redundances -> redpage_exclude, and passes each section yielded by
    RedPage.parse() to RedFamParser.parser(). The db write caches are
    flushed after each parsed page and after each category.

    @param args    Positional arguments; accepted but not used in this body
    """

    try:
        jogobot.output("BEGINN – parser-pages.py")

        # Outer loop: configured categories
        for cat_title in jogobot.config["redundances"]["redpage_cats"]:

            # Inner loop: pages of the current category
            for cat_page in get_cat_pages(cat_title):

                # Pages configured to be excluded are skipped
                page_title = cat_page.title()
                if page_title in (
                        jogobot.config["redundances"]["redpage_exclude"]):
                    continue

                # Wrap the page in a RedPage object
                red_page = redpage.RedPage(cat_page)

                # Nothing to do for this page if parsing is not needed
                if not red_page.is_parsing_needed():
                    continue

                # Run RedFamParser on each redfam section of the page
                # NOTE(review): reads the private attribute _pageid of the
                # page object - confirm it is always populated at this point
                for section in red_page.parse():
                    redfam.RedFamParser.parser(
                        section,
                        red_page.page._pageid,
                        red_page.is_archive(),
                    )

                # Only reached if the whole page was parsed without an
                # exception: flush db write cache and report the page
                redfam.RedFamParser.flush_db_cache()
                jogobot.output("Page '%s' parsed" % red_page.page.title())

            # Only reached if all pages in the category were processed
            # without an exception: flush db write cache
            redpage.RedPage.flush_db_cache()

    finally:
        # Runs on success and on error alike
        jogobot.output("END – parser-pages.py")
        pywikibot.stopme()
|
|
|
|
# Run the parser only when this file is executed as a script
if __name__ == "__main__":
    main()
|
|
|