Files
jogobot-red/lib/redfam.py
Jonathan Golder 6e973369cd sqlalchemy working for parser
Needs some testing, presumably contains some bugs
2017-03-09 00:08:48 +01:00

928 lines
31 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# redfam.py
#
# Copyright 2017 GOLDERWEB Jonathan Golder <jonathan@golderweb.de>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
"""
Provides classes for working with RedFams
"""
import hashlib
import locale
import re
from datetime import datetime
import mwparserfromhell as mwparser # noqa
import pywikibot # noqa
from pywikibot.tools import deprecated # noqa
import jogobot
#~ from lib.mysqlred import Column, Integer, String, Text, DateTime, ForeignKey, ColumnList, Status
from lib.mysqlred import MysqlRedFam, MutableSet, ColumnList #, Mysql, Base, relationship, composite,
class RedFam( MysqlRedFam ):
    """
    Basic class for RedFams, containing the basic data structure

    Builds on MysqlRedFam, which is expected to provide the stored attributes
    (articlesList, beginning, ending, redpageid, status, famhash, heading)
    and the database session used by flush_db_cache().

    NOTE(review): the article_* helpers below read self._articlesList and
    self._article_status, while the rest of the class uses
    self.articlesList. In this file only RedFamWorker sets
    self._article_status -- confirm that MysqlRedFam provides the
    underscore-prefixed attributes.
    """

    def __init__( self, articlesList, beginning, ending=None, redpageid=None,
                  status=None, famhash=None, heading=None ):
        """
        Generates a new RedFam object

        @param articlesList  list        List of articles
        @param beginning     datetime    Beginning date
        @param ending        datetime    Ending date
        @param redpageid     int         MW pageid of containing RedPage
        @param status        MutableSet  Status set of RedFam, a fresh empty
                                         set is created if omitted
        @param famhash       str         SHA1 hash of articlesList
        @param heading       str         Original heading of RedFam (Link)
        """
        # Having pywikibot.Site() is a good idea most of the time
        self.site = pywikibot.Site()

        # A set created in the signature would be one single object shared
        # by every instance constructed without an explicit status, so the
        # default is built per call instead
        if status is None:
            status = MutableSet()

        super().__init__( articlesList=articlesList, beginning=beginning,
                          ending=ending, redpageid=redpageid,
                          famhash=famhash, heading=heading, status=status,
                          articlesStatus=None )

    def __repr__( self ):
        """
        Returns representation str of RedFam object

        @returns str repr() string
        """
        return (
            "RedFam( "
            "articlesList={articles!r}"
            ", heading={heading!r}"
            ", beginning={beginning!r}"
            ", ending={ending!r}"
            ", red_page_id={redpageid!r}"
            ", status={status!r}"
            ", fam_hash={famhash!r}"
            " )"
        ).format( articles=self.articlesList, heading=self.heading,
                  beginning=self.beginning, ending=self.ending,
                  redpageid=self.redpageid, status=self.status,
                  famhash=self.famhash )

    @classmethod
    def calc_famhash( cls, articlesList ):
        """
        Calculates the SHA-1 hash over the first 8 entries of articlesList.
        Since we don't need security SHA-1 is just fine.

        The given list is padded IN PLACE with None up to 8 entries; longer
        lists are only cropped for hashing, not modified.

        @param articlesList  list  List of article titles
        @returns str               String with the hexadecimal hash digest
        """
        # NOTE(review): the in-place padding is visible to the caller.
        # Callers in this file pass the same list on afterwards, so the
        # side effect is kept unchanged.
        while len( articlesList ) < 8:
            articlesList.append( None )

        digest = hashlib.sha1()
        digest.update( str( articlesList[:8] ).encode( 'utf-8' ) )
        return digest.hexdigest()

    def c_famhash( self ):
        """
        Makes sure self.famhash matches the articlesList of this redundance
        family. A missing famhash is calculated and stored, an existing one
        is verified.

        @raises RedFamHashError  if the stored famhash differs from the
                                 calculated one
        """
        calculated = type( self ).calc_famhash( self.articlesList )

        if not self.famhash:
            self.famhash = calculated
        elif calculated != self.famhash:
            raise RedFamHashError( self.famhash, calculated )

    @classmethod
    def flush_db_cache( cls ):
        """
        Commits the pending changes of the database session
        """
        cls.session.commit()

    def add_status( self, status ):
        """
        Adds a status specified by status, to status set

        @param status Statusstring to add
        @type status str
        """
        self.status.add( status )

    def remove_status( self, status, weak=True ):
        """
        Removes a status, specified by status from set. If weak is set to
        False it will throw a KeyError when trying to remove a status not set.

        @param status Statusstring to remove
        @type status str
        @param weak Change behavior on missing status
        @type weak bool
        """
        if weak:
            self.status.discard( status )
        else:
            self.status.remove( status )

    def has_status( self, status ):
        """
        Returns True, if redfam has given status

        @param status Statusstring to check
        @type status str
        @returns True if status is present else False
        """
        return status in self.status

    def _article_index( self, index=None, title=None ):
        """
        Resolves the position of an article in self._articlesList

        The title is only looked up if no (or a zero) index was given. An
        unknown title raises whatever self._articlesList.index() raises.

        @param index Index of article in articlesList
        @type index int
        @param title Title of article in articlesList
        @type title str
        @returns Index of the article
        @rtype int
        @raises IndexError if no int index below the list length results
        """
        if title and not index:
            index = self._articlesList.index( title )

        if isinstance( index, int ) and index < len( self._articlesList ):
            return index

        raise IndexError( "No index given or wrong format!")

    def article_add_status( self, status, index=None, title=None ):
        """
        Adds a status specified by status, to article (identified by title
        or index in articlesList) status set

        @param status Statusstring to add
        @type status str
        @param index Add to article with index in articlesList
        @type index int
        @param title Add to article with title in articlesList
        @type title str
        """
        position = self._article_index( index, title )
        self._article_status[position].add( status )

    def article_remove_status( self, status, index=None, title=None,
                               weak=True ):
        """
        Removes a status specified by status, from article (identified by
        title or index in articlesList) status set
        If weak is set to False it will throw a KeyError when trying to
        remove a status not set.

        @param status Statusstring to remove
        @type status str
        @param index Remove from article with index in articlesList
        @type index int
        @param title Remove from article with title in articlesList
        @type title str
        @param weak Change behavior on missing status
        @type weak bool
        """
        position = self._article_index( index, title )
        if weak:
            self._article_status[position].discard( status )
        else:
            self._article_status[position].remove( status )

    def article_has_status( self, status, index=None, title=None ):
        """
        Checks whether the article (identified by title or index in
        articlesList) has the given status in its status set

        @param status Statusstring to check
        @type status str
        @param index Check article with index in articlesList
        @type index int
        @param title Check article with title in articlesList
        @type title str
        @returns True if status is present else False
        """
        position = self._article_index( index, title )
        return status in self._article_status[position]

    def _article_parse_status( self, raw_status, index=None, title=None ):
        """
        Sets status based on comma separated list to articles (identified by
        title or index in articlesList) status set

        @param raw_status Comma separated string of stati
        @type raw_status str
        @param index Set on article with index in articlesList
        @type index int
        @param title Set on article with title in articlesList
        @type title str
        """
        position = self._article_index( index, title )
        self._article_status[position] = set( raw_status.strip().split(",") )

    def _article_raw_status( self, index=None, title=None ):
        """
        Returns status as comma separated string (to save in DB) of article
        (identified by title or index in articlesList) status set

        @param index Get from article with index in articlesList
        @type index int
        @param title Get from article with title in articlesList
        @type title str
        @returns Raw status string
        @rtype str
        """
        position = self._article_index( index, title )
        return ",".join( self._article_status[position] )
class RedFamParser( RedFam ):
    """
    Provides an interface to RedFam for adding/updating redundance families
    while parsing redundance pages
    """

    # Define the timestamp format
    __timestamp_format = jogobot.config['redundances']['timestamp_format']

    # Define section heading re.pattern
    __sectionhead_pat = re.compile( r"^(.*\[\[.+\]\].*\[\[.+\]\].*)" )

    # Define timestamp re.pattern
    __timestamp_pat = re.compile( jogobot.config['redundances']
                                  ['timestamp_regex'] )

    # Text patterns for recognition of done-notices
    __done_notice = ( ":<small>Archivierung dieses Abschnittes "
                      "wurde gewünscht von:" )
    __done_notice2 = "{{Erledigt|"

    def __init__( self, articlesList, heading, redpage, redpagearchive,
                  beginning, ending=None ):
        """
        Creates a RedFam object based on data collected while parsing
        red_pages and registers it with the database session

        @param articlesList    list      Titles of the articles in heading
        @param heading         str       Wikitext heading of section
        @param redpage         page      Pywikibot.page object
        @param redpagearchive  bool      Is red_page an archive
        @param beginning       datetime  Timestamp of beginning
                               str       as strptime parseable string
        @param ending          datetime  Timestamp of ending
                               str       strptime parseable string
        """
        # Calculates the sha1 hash over articlesList to
        # rediscover known redundance families
        famhash = type( self ).calc_famhash( articlesList )

        # Needed by check_status() below
        self._redpagearchive = redpagearchive

        beginning = self.__datetime( beginning )
        if ending:
            ending = self.__datetime( ending )

        # Use the public pageid property instead of the private _pageid
        # attribute, which is only present once page info has been loaded
        super().__init__( articlesList, beginning, ending=ending,
                          redpageid=redpage.pageid, famhash=famhash,
                          heading=heading )

        # Check status changes
        self.check_status()

        self.session.add( self )

    def update( self, articlesList, heading, redpage, redpagearchive,
                beginning, ending=None ):
        """
        Updates an already known RedFam with freshly parsed data

        An ending which is already set is kept if no new ending is given.

        @param articlesList    list      Titles of the articles in heading
        @param heading         str       Wikitext heading of section
        @param redpage                   Object providing a pageid attribute
                                         (parser() passes its redpage here)
        @param redpagearchive  bool      Is red_page an archive
        @param beginning       datetime  Timestamp of beginning
                               str       as strptime parseable string
        @param ending          datetime  Timestamp of ending
                               str       strptime parseable string
        """
        self.articlesList = articlesList
        self.heading = heading
        self.redpage = redpage
        self.redpageid = redpage.pageid

        self.add_beginning( beginning )
        if ending:
            self.add_ending( ending )

        self._redpagearchive = redpagearchive

        # Check status changes
        self.check_status()

    @classmethod
    def heading_parser( cls, heading ):
        """
        Parses given red_fam_heading string and returns the articles list

        @param heading Heading of RedFam-Section
        @type heading wikicode or mwparser-parseable
        @returns Targets of the wikilinks in heading
        @rtype list of str
        """
        # Parse string heading with mwparser again every time
        # In some cases the given wikicode is broken due to syntax errors
        # (Task FS#77)
        parsed = mwparser.parse( str( heading ) )

        # Collect destinations of wikilinks in heading
        return [ str( link.title ) for link in parsed.ifilter_wikilinks() ]

    def add_beginning( self, beginning ):
        """
        Adds the beginning date of a redundance discussion to the object

        @param beginning datetime or parseable str  Beginning date
        """
        self.beginning = self.__datetime( beginning )

    def add_ending( self, ending ):
        """
        Adds the ending date of a redundance discussion to the object.

        @param ending datetime or parseable str  Ending date
        """
        self.ending = self.__datetime( ending )

    def __datetime( self, timestamp ):
        """
        Decides whether given timestamp is a parseable string or a
        datetime object and returns a datetime object in both cases

        @param datetime timestamp Datetime object
               str      timestamp Parseable string with timestamp
        @returns datetime         Datetime object
        """
        # Make sure locale is set to 'de_DE.UTF-8' to prevent problems
        # with wrong month abbreviations in strptime
        locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

        if isinstance( timestamp, datetime ):
            return timestamp

        return datetime.strptime( timestamp,
                                  type( self ).__timestamp_format )

    def check_status( self ):
        """
        Handles detection of correct status

        The status set is adjusted as follows:
        - "open"      no ending, page is not an archive
        - "done"      ending present, page is not an archive
        - "archived"  page is an archive (ending normally present)
        Other stati in the set are left untouched.
        """
        # No ending, discussion is running:
        # Sometimes archived discussions also have no detectable ending
        if not self.ending and not self._redpagearchive:
            self.add_status("open")
            return

        self.remove_status("open")

        if self._redpagearchive:
            self.remove_status("done")
            self.add_status("archived")
        else:
            self.add_status("done")

    @classmethod
    def is_section_redfam_cb( cls, heading ):
        """
        Used as callback for wikicode.get_sections in redpage.parse to
        select sections which are redfams

        @param heading Heading of the section
        @type heading wikicode or mwparser-parseable
        @returns True if heading contains at least two wikilinks
        @rtype bool
        """
        # Because of strange behavior in some cases, parse heading again
        # (Task FS#77)
        parsed = mwparser.parse( str( heading ) )

        # Make sure we have min. two wikilinks in heading to assume a redfam
        return len( parsed.filter_wikilinks() ) >= 2

    @classmethod
    def parser( cls, text, redpage, isarchive=False ):
        """
        Handles parsing of redfam section

        Known redfams (looked up by famhash, first in redpage.redfams, then
        in the database) are updated, unknown ones are created.

        @param text Text of RedFam-Section
        @type text wikicode or mwparser-parseable
        @param redpage Object providing page, redfams and pageid
        @param isarchive Is the page an archive
        @type isarchive bool
        @returns The updated or newly created redfam
        @rtype RedFamParser
        """
        # Parse text with mwparser if needed
        if not isinstance( text, mwparser.wikicode.Wikicode ):
            text = mwparser.parse( text )

        # Extract heading text
        heading = next( text.ifilter_headings() ).title

        # Extract beginning and maybe ending
        (beginning, ending) = RedFamParser.extract_dates( text, isarchive )

        # Missing beginning (Task: FS#76)
        # Use first day of month of reddisc
        if not beginning:
            match = re.search(
                jogobot.config["redundances"]["reddiscs_onlyinclude_re"],
                redpage.page.title() )

            if match:
                # %B is locale dependent, so set the same locale the other
                # timestamp handling of this class uses before parsing
                locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
                beginning = datetime.strptime(
                    "01. {month} {year}".format(
                        month=match.group(1), year=match.group(2)),
                    "%d. %B %Y" )
        # NOTE(review): if still no beginning is known here, the timestamp
        # conversion further down gets None -- confirm the intended handling

        articlesList = RedFamParser.heading_parser( heading )
        famhash = RedFamParser.calc_famhash( articlesList )

        # Store the heading in the same (stripped) form for new and for
        # updated redfams
        heading = str( heading ).strip()

        # Check for existing objects first in current redpage
        redfam = redpage.redfams.get( famhash )

        with RedFamParser.session.no_autoflush:
            if not redfam:
                # Otherwise in db table
                redfam = RedFamParser.session.query( RedFamParser ).filter(
                    RedFamParser.famhash == famhash ).one_or_none()

            if redfam:
                # Existing redfams need to be updated
                redfam.update( articlesList, heading, redpage, isarchive,
                               beginning, ending )
            else:
                # Create the RedFam object
                redfam = RedFamParser( articlesList, heading, redpage.page,
                                       isarchive, beginning, ending )

        return redfam

    @classmethod
    def extract_dates( cls, text, isarchive=False ):
        """
        Returns tuple of the first and maybe last timestamp of a section.
        Last timestamp is only returned if there is a done notice or param
        *isarchive* is set to 'True'

        @param text Text to search in
        @type text Any Type castable to str
        @param isarchive If true skip searching done notice (on archivepages)
        @type isarchive bool
        @returns Timestamps (beginning, ending), None for each one missing
        @returntype tuple of strs
        """
        # Cast once, so pattern search and done notice checks work on the
        # same string regardless of the type given
        text = str( text )

        # Match all timestamps
        matches = cls.__timestamp_pat.findall( text )

        # Missing dates (Task: FS#76)
        if not matches:
            return (None, None)

        def rebuild( groups ):
            # Since some timestamps are broken we need to reconstruct them
            # by regex match groups
            return ( groups[0] + ", " + groups[1] + ". " +
                     groups[2] + ". " + groups[3] )

        # First one is beginning
        beginning = rebuild( matches[0] )

        # Last one maybe is ending: with done notice format 1, done notice
        # format 2 or on archivepages
        if ( cls.__done_notice in text or
             cls.__done_notice2 in text or
             isarchive ):
            ending = rebuild( matches[-1] )
        else:
            ending = None

        return (beginning, ending)
class RedFamWorker( RedFam ):
    """
    Handles working with redundance families stored in database
    where discussion is finished

    NOTE(review): update_status(), get_disc_link() and
    generate_disc_notice_template() still access self._mysql.data (and
    update_status() calls self._raw_status()); neither is defined anywhere
    in the visible part of this file. Confirm whether MysqlRedFam provides
    them or whether these methods still need to be migrated.
    """

    def __init__( self, mysql_data ):
        """
        Creates a RedFam object from a database row

        @param mysql_data  dict-like  Row data; read keys are the article
                                      columns (keys containing 'article'
                                      but not 'status'), beginning, ending,
                                      redpageid, status, famhash, heading,
                                      pageid and pagetitle
        """
        # Collect the non empty article columns in order of their keys
        articlesList = [
            mysql_data[ key ] for key in sorted( mysql_data.keys() )
            if 'article' in key and 'status' not in key and mysql_data[ key ]
        ]

        # Preset article status list with empty sets for existing articles
        self._article_status = [ set() for _ in articlesList ]

        super().__init__( articlesList, mysql_data[ 'beginning' ],
                          mysql_data[ 'ending' ], mysql_data[ 'redpageid' ],
                          mysql_data[ 'status' ], mysql_data[ 'famhash' ],
                          mysql_data[ 'heading' ] )

        # Get related RedPage-Information
        self.redpageid = mysql_data[ 'pageid' ]
        self.redpagetitle = mysql_data[ 'pagetitle' ]

        # Make sure locale is set to 'de_DE.UTF-8' to prevent problems
        # with wrong month abbreviations in strptime
        locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

    def article_generator( self, filter_existing=None, filter_redirects=None,
                           exclude_article_status=(),
                           onlyinclude_article_status=() ):
        """
        Yields pywikibot pageobjects for articles belonging to this redfam
        in a generator

        @param filter_existing  Set to True to only get existing pages
                                set to False to only get nonexisting pages
                                unset/None results in not filtering
        @type filter_existing   bool/None
        @param filter_redirects Set to True to get only noredirectpages,
                                set to False to get only redirectpages,
                                unset/None results in not filtering
        @type filter_redirects  bool/None
        @param exclude_article_status  Skip articles having any of these
                                       stati
        @type exclude_article_status   iterable of str
        @param onlyinclude_article_status  Skip articles missing any of
                                           these stati
        @type onlyinclude_article_status   iterable of str
        """
        # Iterate over articles in redfam
        for article in self._articlesList:

            # The status filters have to skip the article itself; a
            # `continue` inside a loop over the stati would only advance
            # that inner loop and filter nothing

            # Exclude by article status
            if any( self.article_has_status( status, title=article )
                    for status in exclude_article_status ):
                continue

            # Only include by article status
            if not all( self.article_has_status( status, title=article )
                        for status in onlyinclude_article_status ):
                continue

            page = pywikibot.Page( pywikibot.Link( article ), self.site )

            # Filter non existing Pages if requested with filter_existing=True
            if filter_existing and not page.exists():
                continue
            # Filter existing pages if requested with filter_existing=False
            elif filter_existing is False and page.exists():
                continue

            # Filter redirects if requested with filter_redirects=True
            if filter_redirects and page.isRedirectPage():
                continue
            # Filter noredirects if requested with filter_redirects=False
            elif filter_redirects is False and not page.isRedirectPage():
                continue

            # Yield filtered pages
            yield page

    def update_status( self ):
        """
        Derives the redfam status from the article stati after working on it

        "note_rej" and "sav_err" of any article are taken over to the redfam;
        if the redfam has neither of them, it gets the status "marked".
        """
        for article in self._articlesList:
            for status in ( "note_rej", "sav_err" ):
                if self.article_has_status( status, title=article ):
                    self.add_status( status )

        if not self.has_status( "sav_err" ) and \
                not self.has_status( "note_rej" ):
            self.add_status( "marked" )

        # NOTE(review): self._mysql and self._raw_status() are not defined
        # in the visible part of this file, and the key built below has no
        # underscore before 'status' -- verify both
        self._mysql.data[ 'status' ] = self._raw_status()

        for index, _article in enumerate( self._articlesList ):
            self._mysql.data[ "article" + str(index) + 'status' ] = \
                self._article_raw_status( index=index )

        print( repr(self) )

    def get_disc_link( self ):
        """
        Constructs and returns the link to Redundancy discussion

        @returns Link to discussion
        @rtype str
        """
        # We need to Replace Links with their linktext
        anchor_code = mwparser.parse( self._mysql.data[ 'heading' ].strip() )
        for link in anchor_code.ifilter_wikilinks():
            text = link.text if link.text else link.title
            anchor_code.replace( link, text )

        # Whitespace is replaced with underscores
        anchor_code.replace( " ", "_" )

        # We try it without any more parsing as mw will do while parsing page
        return ( self.redpagetitle + "#" +
                 str(anchor_code).strip() )

    def generate_disc_notice_template( self ):
        """
        Generates notice template to add on discussion Pages of Articles when
        redundancy discussion is finished

        @return Notice template to add on article disc
        @rtype wikicode-node
        """
        # Generate template boilerplate
        template = mwparser.nodes.template.Template(
            jogobot.config['redundances']['disc_notice_template_name'])

        # Articles go to the params 3 to 10, which makes sure only the
        # first 8 articles are used
        for param_cnt, article in zip( range( 3, 11 ), self._articlesList ):
            template.add( param_cnt, article, True )

        # Add begin
        begin = self._mysql.data[ 'beginning' ].strftime( "%B %Y" )
        template.add( "Beginn", begin, True )

        # Add end (if not same as begin)
        # NOTE(review): a missing ending would fail on strftime here
        end = self._mysql.data[ 'ending' ].strftime( "%B %Y" )
        if not end == begin:
            template.add( "Ende", end, True )

        # Add link to related reddisc
        template.add( "Diskussion", self.get_disc_link(), True )

        # Add signature and timestamp
        # Not used atm
        # template.add( 1, "-- ~~~~", True )

        return template

    @classmethod
    def list_by_status( cls, status ):
        """
        Prints red_fams stored in db by given status

        @param status Status to select red_fams by
        """
        mysql = MysqlRedFam()
        for fam in mysql.get_by_status( status ):
            try:
                print( cls( fam ) )
            except RedFamHashError:
                # Show the offending row before passing the error on
                print(fam)
                raise

    @classmethod
    def gen_by_status_and_ending( cls, status, ending ):
        """
        Yield red_fams stored in db by given status which have an ending after
        given one

        @param status Status to select red_fams by
        @param ending Ending to compare with
        """
        mysql = MysqlRedFam()
        for fam in mysql.get_by_status_and_ending( status, ending ):
            try:
                yield cls( fam )
            except RedFamHashError:
                # Show the offending row before passing the error on
                print(fam)
                raise
class RedFamError( Exception ):
    """
    Base class for all Errors of RedFam-Module
    """

    def __init__( self, message=None ):
        """
        Handles Instantiation of RedFamError's

        @param message  str  Error message, a generic one is used if empty
        """
        # Exception.__init__ is intentionally not called: the args of the
        # exception stay the constructor arguments of the concrete class
        if not message:
            message = "An Error occured while executing a RedFam action"
        self.message = message

    def __str__( self ):
        """
        Output of error message
        """
        return self.message


class RedFamHashError( RedFamError ):
    """
    Raised when given RedFamHash does not match with calculated
    """

    def __init__( self, givenHash, calculatedHash ):
        """
        @param givenHash       str  Hash which was provided
        @param calculatedHash  str  Hash which was calculated
        """
        # The message used to end without the closing parenthesis
        message = ( "Given fam_hash ('{given}') does not match with "
                    "calculated ('{calc}')" ).format(
                        given=givenHash, calc=calculatedHash )

        super().__init__( message )


class RedFamHeadingError ( RedFamError ):
    """
    Raised when given RedFamHeading does not match __sectionhead_pat Regex
    """

    def __init__( self, heading ):
        """
        @param heading  str  Heading which could not be parsed
        """
        message = ( "Error while trying to parse section heading. "
                    "Given heading '{heading}' does not match RegEx"
                    ).format( heading=heading )

        super().__init__( message )