Jonathan Golder
8 years ago
2 changed files with 107 additions and 2 deletions
@ -0,0 +1,107 @@ |
|||||
|
#!/usr/bin/env python3 |
||||
|
# -*- coding: utf-8 -*- |
||||
|
# |
||||
|
# parse-pages.py |
||||
|
# |
||||
|
# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de> |
||||
|
# |
||||
|
# This program is free software; you can redistribute it and/or modify |
||||
|
# it under the terms of the GNU General Public License as published by |
||||
|
# the Free Software Foundation; either version 2 of the License, or |
||||
|
# (at your option) any later version. |
||||
|
# |
||||
|
# This program is distributed in the hope that it will be useful, |
||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
|
# GNU General Public License for more details. |
||||
|
# |
||||
|
# You should have received a copy of the GNU General Public License |
||||
|
# along with this program; if not, write to the Free Software |
||||
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
||||
|
# MA 02110-1301, USA. |
||||
|
# |
||||
|
# |
||||
|
""" |
||||
|
Script to parse all redpages in configured categories |
||||
|
""" |
||||
|
|
||||
|
import pywikibot |
||||
|
from pywikibot import pagegenerators |
||||
|
|
||||
|
import jogobot |
||||
|
|
||||
|
import redpage |
||||
|
import redfam |
||||
|
|
||||
|
|
||||
|
def get_cat_pages(cat):
    """
    Build an iterable generator over all pages listed in the given category.

    @param cat      Category to request
    @type  cat      str

    @returns generator  Iterable object yielding the pages of the category
    """
    # The site to work on comes from the pywikibot config; the category
    # object is created on that site and handed to the page generator.
    return pagegenerators.CategorizedPageGenerator(
        pywikibot.Category(pywikibot.Site(), cat))
||||
|
|
||||
|
|
||||
|
def main(*args):
    """
    Parse all redpages in the configured categories.

    Walks every category from the jogobot config entry
    ["redundances"]["redpage_cats"], skips pages whose title is listed in
    ["redundances"]["redpage_exclude"], and passes each section of every
    page that reports parsing as needed to redfam.RedFamParser.parser().
    Begin and end are announced via jogobot.output(), and
    pywikibot.stopme() is called on the way out even if an exception
    propagates.

    @param args     Positional arguments; accepted but not used here
    """
    try:
        jogobot.output("BEGINN – parser-pages.py")

        for cat in jogobot.config["redundances"]["redpage_cats"]:

            for page in get_cat_pages(cat):

                # Pages configured as excluded are skipped entirely
                title = page.title()
                if title in jogobot.config["redundances"]["redpage_exclude"]:
                    continue

                red_page = redpage.RedPage(page)

                # Nothing to do for pages that do not need (re)parsing
                if not red_page.is_parsing_needed():
                    continue

                # Feed every redfam section of the page to the parser
                # NOTE(review): _pageid is a private attribute of the
                # pywikibot page object -- confirm whether a public
                # accessor should be used instead.
                for fam in red_page.parse():
                    redfam.RedFamParser.parser(
                        fam, red_page.page._pageid, red_page.is_archive())

                # Reached only when the section loop finished without
                # raising: write out the cached redfam db changes and
                # report the page as done.
                redfam.RedFamParser.flush_db_cache()
                jogobot.output("Page '%s' parsed" % red_page.page.title())

            # All pages of this category were handled without raising:
            # write out the cached redpage db changes.
            redpage.RedPage.flush_db_cache()

    finally:
        jogobot.output("END – parser-pages.py")
        pywikibot.stopme()
||||
|
|
||||
|
# Start processing only when executed as a script, not when imported
if __name__ == "__main__":
    main()
@ -1,2 +0,0 @@ |
|||||
[flake8] |
|
||||
ignore = E129,E201,E202,W293 |
|
Loading…
Reference in new issue