Jonathan Golder
8 years ago
2 changed files with 107 additions and 2 deletions
@ -0,0 +1,107 @@ |
|||
#!/usr/bin/env python3 |
|||
# -*- coding: utf-8 -*- |
|||
# |
|||
# parse-pages.py |
|||
# |
|||
# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de> |
|||
# |
|||
# This program is free software; you can redistribute it and/or modify |
|||
# it under the terms of the GNU General Public License as published by |
|||
# the Free Software Foundation; either version 2 of the License, or |
|||
# (at your option) any later version. |
|||
# |
|||
# This program is distributed in the hope that it will be useful, |
|||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
# GNU General Public License for more details. |
|||
# |
|||
# You should have received a copy of the GNU General Public License |
|||
# along with this program; if not, write to the Free Software |
|||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|||
# MA 02110-1301, USA. |
|||
# |
|||
# |
|||
""" |
|||
Script to parse all redpages in configured categories |
|||
""" |
|||
|
|||
import pywikibot |
|||
from pywikibot import pagegenerators |
|||
|
|||
import jogobot |
|||
|
|||
import redpage |
|||
import redfam |
|||
|
|||
|
|||
def get_cat_pages( cat ):
    """
    Build an iterable generator object over all pages listed in the given
    category.

    @param cat Category to request
    @type cat str

    @returns generator Iterable object with pages of given category
    """
    # The site to work on comes from the pywikibot user configuration
    wiki = pywikibot.Site()

    # Wrap the requested category title in a Category object and hand it
    # to the stock pagegenerators helper, which yields its member pages
    requested_cat = pywikibot.Category( wiki, cat )

    return pagegenerators.CategorizedPageGenerator( requested_cat )
|||
|
|||
|
|||
def main(*args):
    """
    Handles the parsing process.

    Iterates over the categories configured in jogobot, parses every
    non-excluded redundance page with RedPage/RedFamParser and flushes the
    db write caches after each page and each category. Always announces
    start/end and stops pywikibot cleanly, even when parsing raises.
    """
    try:
        jogobot.output( "BEGINN – parser-pages.py" )

        # Iterate over configured categories
        for cat in jogobot.config["redundances"]["redpage_cats"]:

            # Iterate over pages in current cat
            for page in get_cat_pages( cat ):

                # For pages configured to exclude, go on with next page
                if page.title() in (
                        jogobot.config["redundances"]["redpage_exclude"] ):
                    continue

                # Initiate RedPage object
                red_page = redpage.RedPage( page )

                # Check whether parsing is needed
                if red_page.is_parsing_needed():

                    # Iterate over returned generator with redfam sections
                    for fam in red_page.parse():

                        # Run RedFamParser on section text
                        redfam.RedFamParser.parser(
                            fam, red_page.page._pageid,
                            red_page.is_archive() )

                    # NOTE: the original used "for … else" here and below,
                    # but without a break in the loop the else branch always
                    # runs — plain post-loop code is equivalent and clearer.
                    # If successfully parsed whole page, flush db write cache
                    redfam.RedFamParser.flush_db_cache()
                    jogobot.output( "Page '%s' parsed" %
                                    red_page.page.title() )

            # If successfully parsed all pages in cat, flush db write cache
            redpage.RedPage.flush_db_cache()

    finally:
        # Runs on success and on error alike: announce the end and make
        # sure pywikibot shuts down its framework cleanly
        jogobot.output( "END – parser-pages.py" )
        pywikibot.stopme()
|||
|
|||
# Run the bot only when invoked as a script, not on import
if __name__ == "__main__":
    main()
@ -1,2 +0,0 @@ |
|||
[flake8] |
|||
ignore = E129,E201,E202,W293 |
Loading…
Reference in new issue