Browse Source

Add parse-pages.py Script

develop
Jonathan Golder 8 years ago
parent
commit
a24f208449
  1. 107
      parse-pages.py
  2. 2
      tox.ini

107
parse-pages.py

@ -0,0 +1,107 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# parse-pages.py
#
# Copyright 2016 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
"""
Script to parse all redpages in configured categories
"""
import pywikibot
from pywikibot import pagegenerators
import jogobot
import redpage
import redfam
def get_cat_pages( cat ):
    """
    Build a generator over all pages listed in the given category

    @param cat Category to request
    @type cat str

    @returns generator Iterable object with the pages of the given category
    """

    # The site to work on is taken from the pywikibot config
    wiki_site = pywikibot.Site()

    # Wrap the requested category in a page generator and hand it back
    return pagegenerators.CategorizedPageGenerator(
        pywikibot.Category( wiki_site, cat ) )
def main(*args):
    """
    Parse every redpage found in the configured categories

    Positional arguments are accepted but not used.
    """

    try:
        # NOTE(review): the logged name "parser-pages.py" differs from this
        # script's file name (parse-pages.py) -- confirm which one is intended
        jogobot.output( "BEGINN – parser-pages.py" )

        # Walk through all configured categories
        for cat in jogobot.config["redundances"]["redpage_cats"]:

            # Walk through all pages of the current category
            for page in get_cat_pages( cat ):

                # Pages listed in the exclude configuration are skipped
                if page.title() in (
                        jogobot.config["redundances"]["redpage_exclude"] ):
                    continue

                red_page = redpage.RedPage( page )

                # Skip pages which report that no parsing is needed
                if not red_page.is_parsing_needed():
                    continue

                # Hand each redfam section of the page to RedFamParser
                for fam in red_page.parse():
                    redfam.RedFamParser.parser( fam, red_page.page._pageid,
                                                red_page.is_archive() )

                # The loop above contains no break, so this point is reached
                # whenever the whole page was parsed without an exception:
                # flush the db write cache
                redfam.RedFamParser.flush_db_cache()
                jogobot.output( "Page '%s' parsed" % red_page.page.title() )

            # All pages of the category were processed without an exception:
            # flush the db write cache
            redpage.RedPage.flush_db_cache()

    finally:
        # Runs on success and on error alike
        jogobot.output( "END – parser-pages.py" )
        pywikibot.stopme()
# Run the parser only when this file is executed as a script
if __name__ == "__main__":
    main()

2
tox.ini

@ -1,2 +0,0 @@
[flake8]
ignore = E129,E201,E202,W293
Loading…
Cancel
Save