Browse Source

Merge branch 'fs#95-sqlalchemy' into fs#88-mark-pages-bot

develop
Jonathan Golder 7 years ago
parent
commit
ec7880207b
  1. 4
      bots/markpages.py
  2. 30
      bots/reddiscparser.py
  3. 500
      lib/mysqlred.py
  4. 482
      lib/redfam.py
  5. 151
      lib/redpage.py

4
bots/markpages.py

@ -87,6 +87,8 @@ class MarkPagesBot( CurrentPageBot ): # sets 'current_page' on each treat()
for redfam in self.redfams:
redfam.update_status()
RedFamWorker.flush_db_cache()
@property
def redfams(self):
"""
@ -168,7 +170,7 @@ class MarkPagesBot( CurrentPageBot ): # sets 'current_page' on each treat()
save_ret = self.put_current( self.new_text, summary=summary )
# Status
if add_ret is None or add_ret and save_ret:
if add_ret is None or ( add_ret and save_ret ):
self.current_page.redfam.article_add_status(
"marked",
title=self.current_page.title(withNamespace=False))

30
bots/reddiscparser.py

@ -33,8 +33,8 @@ from pywikibot.bot import ExistingPageBot, NoRedirectPageBot
import jogobot
from lib import redpage
from lib import redfam
from lib.redpage import RedPage
from lib.redfam import RedFamParser
class DiscussionParserBot(
@ -127,7 +127,7 @@ class DiscussionParserBot(
else:
# If successfully parsed all pages in cat, flush db write cache
redpage.RedPage.flush_db_cache()
RedPage.flush_db_cache()
def treat_page( self ):
"""
@ -146,20 +146,23 @@ class DiscussionParserBot(
return
# Initiate RedPage object
red_page = redpage.RedPage( self.current_page )
redpage = RedPage.session.query(RedPage).filter(
RedPage.pageid == self.current_page.pageid ).one_or_none()
# Check whether parsing is needed
if red_page.is_parsing_needed():
if redpage:
redpage.update( self.current_page )
else:
redpage = RedPage( self.current_page )
# Check whether parsing is needed
if redpage.is_parsing_needed():
# Count families for failure analysis
fam_counter = 0
# Iterate over returned generator with redfam sections
for fam in red_page.parse():
for fam in redpage.parse():
# Run RedFamParser on section text
redfam.RedFamParser.parser( fam, red_page.page,
red_page.is_archive() )
RedFamParser.parser( fam, redpage, redpage.archive )
fam_counter += 1
@ -167,12 +170,13 @@ class DiscussionParserBot(
# If successfully parsed whole page, flush
# db write cache
if( fam_counter ):
redfam.RedFamParser.flush_db_cache()
RedFamParser.flush_db_cache()
jogobot.output( "Page [[{reddisc}]] parsed".format(
reddisc=red_page.page.title() ) )
reddisc=redpage.page.title() ) )
else:
jogobot.output(
"\03{red}" + "Page [[{reddisc}]], ".format(
reddisc=red_page.page.title() ) +
reddisc=redpage.page.title() ) +
"containing no redfam, parsed!",
"WARNING" )

500
lib/mysqlred.py

@ -25,350 +25,300 @@
Provides interface classes for communication of redundances bot with mysql-db
"""
# Prefer using oursql, falling back to MySQLdb
try:
import oursql as mysqldb
except ImportError:
import MySQLdb as mysqldb
import atexit # noqa
import atexit
import pywikibot
import pywikibot # noqa
from pywikibot import config
import jogobot
from sqlalchemy import (
create_engine, Column, Integer, String, Text, DateTime, ForeignKey )
from sqlalchemy import text # noqa
from sqlalchemy.engine.url import URL
from sqlalchemy.ext.declarative import (
declarative_base, declared_attr, has_inherited_table )
from sqlalchemy.ext.mutable import MutableComposite, MutableSet
from sqlalchemy.orm import sessionmaker, relationship, composite
from sqlalchemy.orm.collections import attribute_mapped_collection
import sqlalchemy.types as types
class MysqlRed:
"""
Basic interface class, containing opening of connection
Specific queries should be defined in descendant classes per data type
"""
Base = declarative_base()
# Save mysqldb-connection as class attribute to use only one
# in descendant classes
connection = False
db_hostname = config.db_hostname
db_port = config.db_port
db_username = config.db_username
db_password = config.db_password
db_name = config.db_username + jogobot.config['db_suffix']
db_table_prefix = False
# Class variables for storing cached queries
_cached_update_data = []
_update_query = ''
_cached_insert_data = {}
_insert_query = ''
def __init__( self ):
"""
Opens a connection to MySQL-DB
url = URL( "mysql+oursql",
username=config.db_username,
password=config.db_password,
host=config.db_hostname,
port=config.db_port,
database=config.db_username + jogobot.config['db_suffix'] )
engine = create_engine(url, echo=True)
@returns mysql-stream MySQL Connection
"""
# Needs to be generated after Parsing of Args (not at import time)
if not type(self).db_table_prefix:
type(self).db_table_prefix = \
pywikibot.Site().family.dbName(pywikibot.Site().code)
# Now we can setup prepared queries
self._prepare_queries()
Session = sessionmaker(bind=engine)
session = Session()
# Connect to mysqldb only once
if not type( self ).connection:
family = pywikibot.Site().family.dbName(pywikibot.Site().code)
type( self ).connection = mysqldb.connect(
host=type( self ).db_hostname,
port=type( self ).db_port,
user=type( self ).db_username,
passwd=type( self ).db_password,
db=type( self ).db_name )
# Register callback to warn if exiting with cached db write queries
atexit.register( type(self).warn_if_not_flushed )
class Mysql(object):
session = session
def __del__( self ):
"""
Before deleting class, close connection to MySQL-DB
"""
@declared_attr
def _tableprefix(cls):
return family + "_"
type( self ).connection.close()
@declared_attr
def _tablesuffix(cls):
return "s"
def _prepare_queries( self ):
"""
Used to replace placeholders in prepared queries
"""
type(self)._update_query = type(self)._update_query.format(
prefix=type(self).db_table_prefix)
type(self)._insert_query = type(self)._insert_query.format(
prefix=type(self).db_table_prefix)
@declared_attr
def __tablename__(cls):
if has_inherited_table(cls):
return None
name = cls.__name__[len("Mysql"):].lower()
return cls._tableprefix + name + cls._tablesuffix
@classmethod
def flush( cls ):
"""
Run cached queries
"""
if not cls.connection:
raise MysqlRedConnectionError( "No connection exists!" )
cursor = cls.connection.cursor()
# Execute insert query
if cls._cached_insert_data:
# Since cls._cached_insert_data is a dict, we need to have a custom
# Generator to iterate over it
cursor.executemany( cls._insert_query,
( cls._cached_insert_data[ key ]
for key in cls._cached_insert_data ) )
# Reset after writing
cls._cached_insert_data = {}
# Execute update query
# Use executemany since update could not be reduced to one query
if cls._cached_update_data:
cursor.executemany( cls._update_query, cls._cached_update_data )
# Reset after writing
cls._cached_update_data = []
# Commit db changes
if cls._cached_insert_data or cls._cached_update_data:
cls.connection.commit()
@classmethod
def warn_if_not_flushed(cls):
"""
Outputs a warning if there are db write queries cached and not flushed
before exiting the program!
"""
if cls._cached_update_data or cls._cached_insert_data:
jogobot.output( "Cached Database write querys not flushed!!! " +
"Data loss is possible!", "WARNING" )
def changedp(self):
return self.session.is_modified(self)
class MysqlRedPage( MysqlRed ):
class MutableSet(MutableSet):
"""
MySQL-db Interface for handling querys for RedPages
Extended version of the mutable set for our states
"""
# Class variables for storing cached querys
# '{prefix}' will be replaced during super().__init__()
_cached_update_data = []
_update_query = 'UPDATE `{prefix}_redpages` \
SET `pagetitle` = ?, `revid` = ?, `status`= ? WHERE `pageid` = ?;'
_cached_insert_data = {}
_insert_query = 'INSERT INTO `{prefix}_redpages` \
( pageid, pagetitle, revid, status ) VALUES ( ?, ?, ?, ? );'
def __init__( self, pageid ):
def has(self, item):
"""
Creates a new instance, runs __init__ of parent class
"""
super().__init__( )
Check if item is in set
self.__pageid = int( pageid )
self.data = self.get_page()
def __del__( self ):
"""
Needed to prevent descendant classes of MYSQL_RED from deleting
connection to db
@param item Item to check
"""
pass
return item in self
def get_page( self ):
def add(self, item):
"""
Retrieves a red page row from MySQL-Database for given page_id
Extended add method, which only result in changed object if there is
really an item added.
@param int pageid MediaWiki page_id for page to retrieve
@returns tuple Tuple with data for given page_id
bool FALSE if none found
@param item Item to add
"""
if item not in self:
super().add(item)
cursor = type( self ).connection.cursor(mysqldb.DictCursor)
cursor.execute(
'SELECT * FROM `{prefix}_redpages` WHERE `pageid` = ?;'.format(
prefix=type(self).db_table_prefix), ( self.__pageid, ) )
res = cursor.fetchone()
if res:
return res
else:
return False
def add_page( self, pagetitle, revid, status=0 ):
def discard(self, item):
"""
Inserts a red page row in MySQL-Database for given pageid
Wrapper for extended remove below
@param int revid MediaWiki current revid
@param str pagetitle MediaWiki new pagetitle
@param int status Page parsing status
@param item Item to discard
"""
self.remove(item)
insert_data = { self.__pageid: ( self.__pageid, pagetitle,
revid, status ) }
type( self )._cached_insert_data.update( insert_data )
# Manualy construct self.data dict
self.data = { 'pageid': self.__pageid, 'revid': revid,
'pagetitle': pagetitle, 'status': status }
def update_page( self, revid=None, pagetitle=None, status=0 ):
def remove(self, item, weak=True ):
"""
Updates the red page row in MySQL-Database for given page_id
Extended remove method, which only results in changed object if there
is really an item removed. Additionally, combine remove and discard!
@param int revid MediaWiki current rev_id
@param str pagetitle MediaWiki new page_title
@param int status Page parsing status
@param item Item to remove/discard
@param weak Set to false to use remove, else discard behavior
"""
if item in self:
if weak:
super().discard(item)
else:
super().remove(item)
if not pagetitle:
pagetitle = self.data[ 'pagetitle' ]
if not revid:
revid = self.data[ 'revid' ]
type( self )._cached_update_data.append( ( pagetitle, revid,
status, self.__pageid ) )
class MysqlRedFam( MysqlRed ):
class ColumnList( list, MutableComposite ):
"""
MySQL-db Interface for handling querys for RedFams
Combines multiple Colums into a list like object
"""
# Class variables for storing cached querys
_cached_update_data = []
_update_query = 'UPDATE `{prefix}_redfams` \
SET `redpageid` = ?, `heading` = ?, `beginning` = ?, `ending` = ?, \
`status`= ? WHERE `famhash` = ?;'
_cached_insert_data = {}
_insert_query = 'INSERT INTO `{prefix}_redfams` \
( famhash, redpageid, beginning, ending, status, heading, \
article0, article1, article2, article3, article4, article5, article6, \
article7 ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );'
def __init__( self, famhash=None ):
def __init__( self, *columns ):
"""
Creates a new instance, runs __init__ of parent class
Wrapper to the list constructor deciding whether we have initialization
with individual params per article or with an iterable.
"""
# Individual params per article (from db), first one is a str
if isinstance( columns[0], str ) or \
isinstance( columns[0], MutableSet ) or columns[0] is None:
super().__init__( columns )
# Iterable articles list
else:
super().__init__( columns[0] )
self.__famhash = famhash
super().__init__( )
def __del__( self ):
"""
Needed to prevent descendant classes of MYSQL_RED from deleting
connection to db
"""
pass
def get_fam( self, famhash ):
def __setitem__(self, key, value):
"""
Retrieves a red family row from MySQL-Database for given fam_hash
@returns dict Dictionary with data for given fam hash
False if none found
The MutableComposite class needs to be noticed about changes in our
component. So we tweak the setitem process.
"""
self.__famhash = famhash
cursor = type( self ).connection.cursor( mysqldb.DictCursor )
# set the item
super().__setitem__( key, value)
cursor.execute(
'SELECT * FROM `{prefix}_redfams` WHERE `famhash` = ?;'.
format( prefix=type(self).db_table_prefix), ( famhash, ) )
# alert all parents to the change
self.changed()
self.data = cursor.fetchone()
def add_fam( self, articlesList, heading, redpageid,
beginning, ending=None, status=0 ):
data = [ self.__famhash, redpageid, beginning, ending,
status, heading ]
for article in articlesList:
data.append( str( article ) )
def __composite_values__(self):
"""
The Composite method needs to have this method to get the items for db.
"""
return self
while len( data ) < 14:
data.append( None )
data = tuple( data )
class Status( types.TypeDecorator ):
insert_data = { self.__famhash: data }
type( self )._cached_insert_data.update( insert_data )
impl = types.String
# Manualy construct self.data dict
data_keys = ( 'famhash', 'redpageid', 'beginning', 'ending',
'status', 'heading', 'article0', 'article1', 'article2',
'article3', 'article4', 'article5', 'article6',
'article7' )
self.data = dict( zip( data_keys, data ) )
def process_bind_param(self, value, dialect):
"""
Returns status as comma-separated string (to save in DB)
def update_fam( self, redpageid, heading, beginning, ending, status ):
@returns Raw status string
@rtype str
"""
Updates the red fam row in MySQL-Database for given fam_hash
if isinstance(value, MutableSet):
return ",".join( value )
elif isinstance(value, String ) or value is None:
return value
else:
raise TypeError(
"Value should be an instance of one of {0:s},".format(
str( [type(MutableSet()), type(String()), type(None)] ) ) +
"given value was an instance of {1:s}".format(
str(type(value))) )
@param int redpageid MediaWiki page_id
@param datetime beginning Timestamp of beginning
@param datetime ending Timestamp of ending
@param int status red_fam status
def process_result_value(self, value, dialect):
"""
Sets status based on comma separated list
type( self )._cached_update_data.append( ( redpageid, heading,
beginning, ending, status,
self.__famhash ) )
@param raw_status Comma-separated string of statuses (from DB)
@type raw_status str
"""
if value:
return MutableSet( value.strip().split(","))
else:
return MutableSet([])
def copy(self, **kw):
return Status(self.impl.length)
class MysqlRedFam( Mysql, Base ):
famhash = Column( String(64), primary_key=True, unique=True )
__article0 = Column('article0', String(255), nullable=False )
__article1 = Column('article1', String(255), nullable=False )
__article2 = Column('article2', String(255), nullable=True )
__article3 = Column('article3', String(255), nullable=True )
__article4 = Column('article4', String(255), nullable=True )
__article5 = Column('article5', String(255), nullable=True )
__article6 = Column('article6', String(255), nullable=True )
__article7 = Column('article7', String(255), nullable=True )
__articlesList = composite(
ColumnList, __article0, __article1, __article2, __article3,
__article4, __article5, __article6, __article7 )
heading = Column( Text, nullable=False )
redpageid = Column(
Integer, ForeignKey( family + "_redpages.pageid" ), nullable=False )
beginning = Column( DateTime, nullable=False )
ending = Column( DateTime, nullable=True )
_status = Column( 'status', MutableSet.as_mutable(Status(255)),
nullable=True )
__article0_status = Column(
'article0_status', MutableSet.as_mutable(Status(64)), nullable=True )
__article1_status = Column(
'article1_status', MutableSet.as_mutable(Status(64)), nullable=True )
__article2_status = Column(
'article2_status', MutableSet.as_mutable(Status(64)), nullable=True )
__article3_status = Column(
'article3_status', MutableSet.as_mutable(Status(64)), nullable=True )
__article4_status = Column(
'article4_status', MutableSet.as_mutable(Status(64)), nullable=True )
__article5_status = Column(
'article5_status', MutableSet.as_mutable(Status(64)), nullable=True )
__article6_status = Column(
'article6_status', MutableSet.as_mutable(Status(64)), nullable=True )
__article7_status = Column(
'article7_status', MutableSet.as_mutable(Status(64)), nullable=True )
__articlesStatus = composite(
ColumnList, __article0_status, __article1_status, __article2_status,
__article3_status, __article4_status, __article5_status,
__article6_status, __article7_status )
redpage = relationship( "MysqlRedPage", enable_typechecks=False,
back_populates="redfams" )
@property
def articlesList(self):
"""
List of articles belonging to the redfam
"""
return self.__articlesList
@articlesList.setter
def articlesList(self, articlesList):
# Make sure to always have full length for complete overwrites
while( len(articlesList) < 8 ):
articlesList.append(None)
self.__articlesList = ColumnList(articlesList)
@property
def status( self ):
"""
Current fam status
"""
return self._status
@status.setter
def status( self, status ):
if status:
self._status = MutableSet( status )
else:
self._status = MutableSet()
def get_by_status( self, status ):
@property
def articlesStatus(self):
"""
Generator which fetches redFams with given status from DB
List of status strings/sets for the articles of the redfam
"""
return self.__articlesStatus
cursor = type( self ).connection.cursor( mysqldb.DictCursor )
@articlesStatus.setter
def articlesStatus(self, articlesStatus):
self.__articlesStatus = ColumnList(articlesStatus)
cursor.execute(
'SELECT * FROM `{prefix}_redfams` WHERE `status` = LIKE %?%;'.
format( prefix=type( self ).db_table_prefix), ( status, ) )
while True:
res = cursor.fetchmany( 1000 )
if not res:
break
for row in res:
yield row
class MysqlRedPage( Mysql, Base ):
pageid = Column( Integer, unique=True, primary_key=True )
revid = Column( Integer, unique=True, nullable=False )
pagetitle = Column( String(255), nullable=False )
__status = Column( 'status', MutableSet.as_mutable(Status(255)),
nullable=True )
def get_by_status_and_ending( self, status, ending ):
redfams = relationship(
"MysqlRedFam", enable_typechecks=False,
back_populates="redpage", order_by=MysqlRedFam.famhash,
collection_class=attribute_mapped_collection("famhash") )
@property
def status( self ):
"""
Generator which fetches redFams with given status from DB
Current fam status
"""
return self.__status
@status.setter
def status( self, status ):
if status:
self.__status = MutableSet( status )
else:
self.__status = MutableSet()
cursor = type( self ).connection.cursor( mysqldb.DictCursor )
cursor.execute( (
'SELECT * ' +
'FROM `{prefix}_redfams` `F` ' +
'INNER JOIN `{prefix}_redpages` `P` ' +
'ON `F`.`status` = ? ' +
'AND `F`.`ending` >= ? ' +
'AND `F`.`redpageid` = `P`.`pageid`;').format(
prefix=type( self ).db_table_prefix),
( status, ending ) )
while True:
res = cursor.fetchmany( 1000 )
if not res:
break
for row in res:
yield row
Base.metadata.create_all(engine)
class MysqlRedError(Exception):

482
lib/redfam.py

@ -3,7 +3,7 @@
#
# redfam.py
#
# Copyright 2015 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
# Copyright 2017 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -35,10 +35,10 @@ import pywikibot # noqa
from pywikibot.tools import deprecated # noqa
import jogobot
from lib.mysqlred import MysqlRedFam
from lib.mysqlred import MysqlRedFam, text
class RedFam:
class RedFam( MysqlRedFam ):
"""
Basic class for RedFams, containing the basic data structure
"""
@ -60,22 +60,16 @@ class RedFam:
# Having pywikibot.Site() is a good idea most of the time
self.site = pywikibot.Site()
# Database interface
self._mysql = MysqlRedFam( famhash )
# Initial attribute values
self._articlesList = articlesList
self._beginning = beginning
self._ending = ending
self._redpageid = redpageid
self._status = set()
self._status = self._parse_status(status)
self._famhash = famhash
self._heading = heading
# Calculates the sha1 hash over self._articlesList to
# rediscover known redundance families
self.calc_famhash()
super().__init__(
articlesList=articlesList,
beginning=beginning,
ending=ending,
redpageid=redpageid,
famhash=famhash,
heading=heading,
status=status,
articlesStatus=None
)
def __repr__( self ):
"""
@ -85,18 +79,20 @@ class RedFam:
"""
__repr = "RedFam( " + \
"articlesList=" + repr( self._articlesList ) + \
", heading=" + repr( self._heading ) + \
", beginning=" + repr( self._beginning ) + \
", ending=" + repr( self._ending ) + \
", red_page_id=" + repr( self._redpageid ) + \
", status=" + repr( self._status ) + \
", fam_hash=" + repr( self._famhash ) + \
"articlesList=" + repr( self.articlesList ) + \
", heading=" + repr( self.heading ) + \
", beginning=" + repr( self.beginning ) + \
", ending=" + repr( self.ending ) + \
", red_page_id=" + repr( self.redpageid ) + \
", status=" + repr( self.status ) + \
", fam_hash=" + repr( self.famhash ) + \
", articlesStatus=" + repr( self.articlesStatus ) + \
" )"
return __repr
def calc_famhash( self ):
@classmethod
def calc_famhash(cls, articlesList ):
"""
Calculates the SHA-1 hash for the articlesList of redundance family.
Since we don't need security SHA-1 is just fine.
@ -105,99 +101,21 @@ class RedFam:
"""
h = hashlib.sha1()
h.update( str( self._articlesList[:8] ).encode('utf-8') )
# Since articlesList attr of RedFam will have always 8 Members we
# need to fill up smaller lists (longers will be cropped below).
while len( articlesList) < 8:
articlesList.append(None)
if self._famhash and h.hexdigest() != self._famhash:
raise RedFamHashError( self._famhash, h.hexdigest() )
h.update( str( articlesList[:8] ).encode('utf-8') )
elif self._famhash:
return
else:
self._famhash = h.hexdigest()
def changed( self ):
"""
Checks whether anything has changed and maybe triggers db update
"""
# On archived redfams do not delete possibly existing ending
if( not self._ending and "archived" in self._status and
self._mysql.data[ 'ending' ] ):
self._ending = self._mysql.data[ 'ending' ]
# Since status change means something has changed, update database
if( self._raw_status != self._mysql.data[ 'status' ] or
self._beginning != self._mysql.data[ 'beginning' ] or
self._ending != self._mysql.data[ 'ending' ] or
self._red_page_id != self._mysql.data[ 'redpageid' ] or
self._heading != self._mysql.data[ 'heading' ]):
self._mysql.update_fam( self._redpageid, self._heading,
self._beginning, self._ending,
self._raw_status() )
return h.hexdigest()
@classmethod
def flush_db_cache( cls ):
"""
Calls flush method of Mysql Interface class
"""
MysqlRedFam.flush()
def add_status(self, status):
"""
Adds a status specified by status, to status set
@param status Statusstring to add
@type status str
"""
self._status.add(status)
def remove_status(self, status, weak=True):
"""
Removes a status, specified by status from set. If weak is set to
False it will throw a KeyError when trying to remove a status not set.
@param status Statusstring to add
@type status str
@param weak Change behavior on missing status
@type bool
"""
if weak:
self._status.discard(status)
else:
self._status.remove(status)
def has_status(self, status):
"""
Returns True, if redfam has given status
@param status Statusstring to check
@type status str
@returns True if status is present else False
"""
if status in self._status:
return True
else:
return False
def _parse_status(self, raw_status ):
"""
Sets status based on comma separated list
@param raw_status Comma-separated string of statuses (from DB)
@type raw_status str
"""
self._status = set( raw_status.strip().split(","))
def _raw_status( self ):
"""
Returns status as commaseparated string (to save in DB)
@returns Raw status string
@rtype str
"""
return ",".join( self._status )
cls.session.commit()
def article_add_status(self, status, index=None, title=None ):
"""
@ -212,10 +130,10 @@ class RedFam:
@type title str
"""
if title and not index:
index = self._articlesList.index( title )
index = self.articlesList.index( title )
if isinstance( index, int ) and index < len(self._articlesList):
self._article_status[index].add(status)
if isinstance( index, int ) and index < len(self.articlesList):
self.articlesStatus[index].add(status)
else:
raise IndexError( "No index given or wrong format!")
@ -236,13 +154,13 @@ class RedFam:
@type bool
"""
if title and not index:
index = self._articlesList.index( title )
index = self.articlesList.index( title )
if isinstance( index, int ) and index < len(self._articlesList):
if isinstance( index, int ) and index < len(self.articlesList):
if weak:
self._article_status[index].discard(status)
self.articlesStatus[index].discard(status)
else:
self._article_status[index].remove(status)
self.articlesStatus[index].remove(status)
else:
raise IndexError( "No index given or wrong format!")
@ -259,56 +177,16 @@ class RedFam:
@type title str
"""
if title and not index:
index = self._articlesList.index( title )
index = self.articlesList.index( title )
if isinstance( index, int ) and index < len(self._articlesList):
if status in self._article_status[index]:
if isinstance( index, int ) and index < len(self.articlesList):
if status in self.articlesStatus[index]:
return True
else:
return False
else:
raise IndexError( "No index given or wrong format!")
def _article_parse_status(self, raw_status, index=None, title=None ):
"""
Sets status based on comma separated list to articles (identified by
title or index in articlesList) status set
@param status Statusstring to set
@type status str
@param index Add to article with index in articlesList
@type index int
@param title Add to article with title in articlesList
@type title str
"""
if title and not index:
index = self._articlesList.index( title )
if isinstance( index, int ) and index < len(self._articlesList):
self._article_status[index] = set( raw_status.strip().split(","))
else:
raise IndexError( "No index given or wrong format!")
def _article_raw_status( self, index=None, title=None ):
"""
Returns status as comma-separated string (to save in DB) of article
(identified by title or index in articlesList) status set
@param index Get from article with index in articlesList
@type index int
@param title Get from article with title in articlesList
@type title str
@returns Raw status string
@rtype str
"""
if title and not index:
index = self._articlesList.index( title )
if isinstance( index, int ) and index < len(self._articlesList):
return ",".join( self._article_status[index] )
else:
raise IndexError( "No index given or wrong format!")
class RedFamParser( RedFam ):
"""
@ -331,7 +209,7 @@ class RedFamParser( RedFam ):
wurde gewünscht von:"
__done_notice2 = "{{Erledigt|"
def __init__( self, heading, redpage, redpagearchive,
def __init__( self, articlesList, heading, redpage, redpagearchive,
beginning, ending=None ):
"""
Creates a RedFam object based on data collected while parsing red_pages
@ -346,57 +224,50 @@ class RedFamParser( RedFam ):
str strptime parseable string
"""
# Set object attributes:
self._redpageid = redpage._pageid
self._redpagearchive = redpagearchive
self._famhash = None
# Method self.add_beginning sets self._beginning directly
self.add_beginning( beginning )
# Calculates the sha1 hash over self._articlesList to
# rediscover known redundance families
famhash = type(self).calc_famhash(articlesList)
# Method self.add_ending sets self._ending directly
if( ending ):
self.add_ending( ending )
else:
# If no ending was provided set to None
self._ending = None
# Set object attributes:
self.redpage = redpage
self._status = set()
# Parse Timestamps
beginning = self.__datetime(beginning)
if ending:
ending = self.__datetime(ending)
# Parse the provided heading of redundance section
# to set self._articlesList
self.heading_parser( heading )
super().__init__( articlesList,
beginning,
ending=ending,
redpageid=redpage.page._pageid,
famhash=famhash,
heading=heading )
# Calculates the sha1 hash over self._articlesList to
# rediscover known redundance families
# Check status changes
self.check_status()
self.calc_famhash()
self.session.add(self)
# Open database connection, ask for data if existing,
# otherwise create entry
self.__handle_db()
def update( self, articlesList, heading, redpage, redpagearchive,
beginning, ending=None ):
# Check status changes
self.status()
self.articlesList = articlesList
self.heading = heading
self.redpage = redpage
self.redpageid = redpage.pageid
# Triggers db update if anything changed
self.changed()
self.add_beginning( beginning )
def __handle_db( self ):
"""
Handles opening of db connection
"""
if ending:
self.add_ending( ending )
# We need a connection to our mysqldb
self._mysql = MysqlRedFam( )
self._mysql.get_fam( self._famhash )
self._redpagearchive = redpagearchive
if not self._mysql.data:
self._mysql.add_fam( self._articlesList, self._heading,
self._redpageid, self._beginning,
self._ending )
# Check status changes
self.check_status()
def heading_parser( self, heading ):
@classmethod
def heading_parser( cls, heading ):
"""
Parses given red_fam_heading string and saves articles list
@ -404,34 +275,13 @@ class RedFamParser( RedFam ):
@type heading wikicode or mwparser-parseable
"""
# Save heading as string
self._heading = str( heading )
# Parse string heading with mwparse again everytime
# In some cases the given wikicode is broken due to syntax errors
# (Task FS#77)
heading = mwparser.parse( self._heading )
heading = mwparser.parse( str( heading ) )
# Save destinations of wikilinks in headings
self._articlesList = [ str( link.title ) for link
in heading.ifilter_wikilinks() ]
# Catch sections with more then 8 articles, print error
if len( self._articlesList ) > 8:
# For repression in output we need to know the fam hash
self.calc_famhash()
jogobot.output(
( "\03{{lightred}}" +
"Maximum number of articles in red_fam exceeded, " +
"maximum number is 8, {number:d} were given \n {repress}"
).format( datetime=datetime.now().strftime(
"%Y-%m-%d %H:%M:%S" ), number=len( self._articlesList ),
repress=repr( self ) ),
"WARNING" )
# Only save the first 8 articles
self._articlesList = self._articlesList[:8]
return [ str( link.title ) for link in heading.ifilter_wikilinks() ]
def add_beginning( self, beginning ):
"""
@ -440,7 +290,7 @@ class RedFamParser( RedFam ):
@param datetime datetime Beginning date
"""
self._beginning = self.__datetime( beginning )
self.beginning = self.__datetime( beginning )
def add_ending( self, ending ):
"""
@ -449,7 +299,7 @@ class RedFamParser( RedFam ):
@param datetime datetime Ending date
"""
self._ending = self.__datetime( ending )
self.ending = self.__datetime( ending )
def __datetime( self, timestamp ):
"""
@ -473,7 +323,7 @@ class RedFamParser( RedFam ):
type( self ).__timestamp_format )
return result
def status( self ):
def check_status( self ):
"""
Handles detection of correct status
There are three possible stati:
@ -485,16 +335,16 @@ class RedFamParser( RedFam ):
# No ending, discussion is running:
# Sometimes archived discussions also have no detectable ending
if not self._ending and not self._redpagearchive:
self.add_status("open")
if not self.ending and not self.redpage.archive:
self.status.add("open")
else:
self.remove_status("open")
if not self._redpagearchive:
self.add_status("done")
self.status.remove("open")
if not self.redpage.archive:
self.status.add("done")
else:
self.remove_status("done")
self.remove_status("open")
self.add_status("archived")
self.status.remove("done")
self.status.remove("open")
self.status.add("archived")
@classmethod
def is_section_redfam_cb( cls, heading ):
@ -513,7 +363,7 @@ class RedFamParser( RedFam ):
return False
@classmethod
def parser( cls, text, page, isarchive=False ):
def parser( cls, text, redpage, isarchive=False ):
"""
Handles parsing of redfam section
@ -526,7 +376,7 @@ class RedFamParser( RedFam ):
text = mwparser.parse( text )
# Extract heading text
heading = next( text.ifilter_headings() ).title
heading = next( text.ifilter_headings() ).title.strip()
# Extract beginning and maybe ending
(beginning, ending) = RedFamParser.extract_dates( text, isarchive )
@ -536,16 +386,37 @@ class RedFamParser( RedFam ):
if not beginning:
match = re.search(
jogobot.config["redundances"]["reddiscs_onlyinclude_re"],
page.title() )
redpage.page.title() )
if match:
beginning = datetime.strptime(
"01. {month} {year}".format(
month=match.group(1), year=match.group(2)),
"%d. %B %Y" )
articlesList = RedFamParser.heading_parser( heading )
famhash = RedFamParser.calc_famhash( articlesList )
# Check for existing objects in DB first in current redpage
redfam = redpage.redfams.get(famhash)
# Create the RedFam object
RedFamParser( heading, page, isarchive, beginning, ending )
with RedFamParser.session.no_autoflush:
if not redfam:
# Otherwise in db table
redfam = RedFamParser.session.query(RedFamParser).filter(
RedFamParser.famhash == famhash ).one_or_none()
if redfam:
# Existing redfams need to be updated
redfam.update( articlesList, str(heading), redpage, isarchive,
beginning, ending )
else:
# Create the RedFam object
redfam = RedFamParser( articlesList, str(heading),
redpage, isarchive, beginning, ending )
# Add redfam to redpage object
redpage.redfams.set( redfam )
@classmethod
def extract_dates( cls, text, isarchive=False ):
@ -599,42 +470,16 @@ class RedFamWorker( RedFam ):
Handles working with redundance families stored in database
where discussion is finished
"""
def __init__( self, mysql_data ):
articlesList = []
for key in sorted( mysql_data.keys() ):
if 'article' in key and 'status' not in key and mysql_data[ key ]:
articlesList.append( mysql_data[ key ] )
# Preset article status list with empty sets for existing articles
self._article_status = [set() for x in range(0, len(articlesList))]
super().__init__( articlesList, mysql_data[ 'beginning' ],
mysql_data[ 'ending' ], mysql_data[ 'redpageid' ],
mysql_data[ 'status' ], mysql_data[ 'famhash' ],
mysql_data[ 'heading' ] )
def __init__( self ):
self._mysql.data = mysql_data
# Set up article status
index = 0
for article in self._articlesList:
raw_status = mysql_data[ "article" + str(index) + "_status" ]
if not raw_status:
raw_status = str()
self._article_parse_status( raw_status, index )
index += 1
# Get related RedPage-Information
self.redpageid = mysql_data[ 'pageid' ]
self.redpagetitle = mysql_data[ 'pagetitle' ]
super().__init__()
# Make sure locale is set to 'de_DE.UTF-8' to prevent problems
# with wrong month abreviations in strptime
locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
def article_generator(self, filter_existing=None, filter_redirects=None,
def article_generator(self, # noqa
filter_existing=None, filter_redirects=None,
exclude_article_status=[],
onlyinclude_article_status=[] ):
"""
@ -653,8 +498,34 @@ class RedFamWorker( RedFam ):
"""
# Iterate over articles in redfam
for article in self._articlesList:
page = pywikibot.Page(pywikibot.Link(article), self.site)
for article in self.articlesList:
# Not all list elements contain articles
if not article:
break
page = pywikibot.Page(pywikibot.Link(article), pywikibot.Site())
# Filter existing pages if requested with filter_existing=False
if page.exists():
self.article_remove_status( "deleted", title=article )
if filter_existing is False:
continue
# Filter non existing Pages if requested with filter_existing=True
else:
self.article_add_status( "deleted", title=article )
if filter_existing:
continue
# Filter redirects if requested with filter_redirects=True
if page.isRedirectPage():
self.article_add_status( "redirect", title=article )
if filter_redirects:
continue
# Filter noredirects if requested with filter_redirects=False
else:
self.article_remove_status("redirect", title=article )
if filter_redirects is False:
continue
# Exclude by article status
for status in exclude_article_status:
@ -666,20 +537,6 @@ class RedFamWorker( RedFam ):
if not self.article_has_status( status, title=article ):
continue
# Filter non existing Pages if requested with filter_existing=True
if filter_existing and not page.exists():
continue
# Filter existing pages if requested with filter_existing=False
elif filter_existing is False and page.exists():
continue
# Filter redirects if requested with filter_redirects=True
if filter_redirects and page.isRedirectPage():
continue
# Filter noredirects if requested with filter_redirects=False
elif filter_redirects is False and not page.isRedirectPage():
continue
# Yield filtered pages
yield page
@ -687,24 +544,18 @@ class RedFamWorker( RedFam ):
"""
Sets status to 3 when worked on
"""
for article in self._articlesList:
for article in self.articlesList:
if not article:
break
if self.article_has_status( "note_rej", title=article ):
self.add_status( "note_rej" )
self.status.add( "note_rej" )
if self.article_has_status( "sav_err", title=article ):
self.add_status( "sav_err" )
if not self.has_status( "sav_err" ) and \
not self.has_status( "note_rej" ):
self.add_status( "marked" )
self.status.add( "sav_err" )
self._mysql.data[ 'status' ] = self._raw_status()
index = 0
for article in self._articlesList:
self._mysql.data[ "article" + str(index) + 'status' ] = \
self._article_raw_status( index=index )
index += 1
print( repr(self) )
if not self.status.has( "sav_err" ) and \
not self.status.has( "note_rej" ):
self.status.add( "marked" )
def get_disc_link( self ):
"""
@ -715,7 +566,7 @@ class RedFamWorker( RedFam ):
"""
# We need to Replace Links with their linktext
anchor_code = mwparser.parse( self._mysql.data[ 'heading' ].strip() )
anchor_code = mwparser.parse( self.heading.strip() )
for link in anchor_code.ifilter_wikilinks():
if link.text:
text = link.text
@ -728,7 +579,7 @@ class RedFamWorker( RedFam ):
anchor_code.replace( " ", "_" )
# We try it with out any more parsing as mw will do while parsing page
return ( self.redpagetitle + "#" +
return ( self.redpage.pagetitle + "#" +
str(anchor_code).strip() )
def generate_disc_notice_template( self ):
@ -748,7 +599,9 @@ class RedFamWorker( RedFam ):
param_cnt = 3
# Iterate over articles in redfam
for article in self._articlesList:
for article in self.articlesList:
if not article:
break
# Make sure to only use 8 articles (max. param 10)
if param_cnt > 10:
break
@ -759,11 +612,11 @@ class RedFamWorker( RedFam ):
param_cnt += 1
# Add begin
begin = self._mysql.data[ 'beginning' ].strftime( "%B %Y" )
begin = self.beginning.strftime( "%B %Y" )
template.add( "Beginn", begin, True )
# Add end (if not same as begin)
end = self._mysql.data[ 'ending' ].strftime( "%B %Y" )
end = self.ending.strftime( "%B %Y" )
if not end == begin:
template.add( "Ende", end, True )
@ -795,13 +648,14 @@ class RedFamWorker( RedFam ):
Yield red_fams stored in db by given status which have an ending after
given one
"""
mysql = MysqlRedFam()
for fam in mysql.get_by_status_and_ending( status, ending ):
try:
yield cls( fam )
except RedFamHashError:
print(fam)
raise
for redfam in RedFamWorker.session.query(RedFamWorker).filter(
# NOT WORKING WITH OBJECT NOTATION
# RedFamWorker._status.like('archived'),
# RedFamWorker._status.like("%{0:s}%".format(status)),
text("status LIKE '%archived%'"),
RedFamWorker.ending >= ending ):
yield redfam
class RedFamError( Exception ):

151
lib/redpage.py

@ -34,7 +34,7 @@ from lib.mysqlred import MysqlRedPage
from lib.redfam import RedFamParser
class RedPage:
class RedPage( MysqlRedPage ):
"""
Class for handling redundance discussion pages and archives
"""
@ -49,73 +49,55 @@ class RedPage:
@type pageid int
"""
self._status = set()
# Safe the pywikibot page object
self.page = page
self.pageid = pageid
self._archive = archive
if page:
self._page = page
self.__handle_db( )
self.is_page_changed()
super().__init__(
pageid=self._page.pageid,
revid=self._page._revid,
pagetitle=self._page.title(),
status=None
)
self._parsed = None
self.is_archive()
def __handle_db( self ):
"""
Handles opening of db connection
"""
self.session.add(self)
# We need a connection to our mysqldb
if self.page:
self.__mysql = MysqlRedPage( self.page._pageid )
self.pageid = self.page._pageid
elif self.pageid:
self.__mysql = MysqlRedPage( self.pageid )
self.page = pywikibot.Page( pywikibot.Site(),
self.__mysql.data['pagetitle'] )
self.page.exists()
else:
raise ValueError( "Page NOR pagid provided!" )
def update( self, page ):
self._page = page
self.revid = page._revid
self.pagetitle = page.title()
self.is_archive()
if not self.__mysql.data:
self.__mysql.add_page( self.page.title(), self.page._revid )
@property
def page(self):
if not hasattr(self, "_page"):
self._page = pywikibot.Page( pywikibot.Site(), self.pagetitle )
def is_page_changed( self ):
"""
Check wether the page was changed since last run
"""
return self._page
if( self.__mysql.data != { 'pageid': self.page._pageid,
'revid': self.page._revid,
'pagetitle': self.page.title(),
'status': self.__mysql.data[ 'status' ] } ):
self._changed = True
else:
self._changed = False
@property
def archive(self):
self.is_archive()
return self.status.has("archive")
def is_archive( self ):
"""
Detects wether current page is an archive of discussions
"""
if( self._archive or ( u"/Archiv" in self.page.title() ) or
if( ( u"/Archiv" in self.page.title() ) or
( "{{Archiv}}" in self.page.text ) or
( "{{Archiv|" in self.page.text ) ):
return True
self.status.add("archive")
else:
return False
self.status.discard("archive")
def is_parsing_needed( self ):
"""
Decides wether current RedPage needs to be parsed or not
"""
if( self._changed or self.__mysql.data[ 'status' ] == "" ):
return True
else:
return False
return self.changedp() or not self.status.has("parsed")
def parse( self ):
"""
@ -140,83 +122,12 @@ class RedPage:
yield fam
else:
self.status.add("parsed")
self._parsed = True
self.__update_db()
def __update_db( self ):
"""
Updates the page meta data in mysql db
"""
if( self._parsed or not self._changed ):
self.add_status( "open" )
if( self.is_archive() ):
self.remove_status( "open" )
self.add_status( "archived" )
else:
self._status = set()
self.__mysql.update_page( self.page._revid, self.page.title(),
self._raw_status() )
@classmethod
def flush_db_cache( cls ):
"""
Calls flush method of Mysql Interface class
"""
MysqlRedPage.flush()
def add_status(self, status):
"""
Adds a status specified by status, to status set
@param status Statusstring to add
@type status str
"""
self._status.add(status)
def remove_status(self, status, weak=True):
"""
Removes a status, specified by status from set. If weak is set to
False it will throw a KeyError when trying to remove a status not set.
@param status Statusstring to add
@type status str
@param weak Change behavior on missing status
@type bool
"""
if weak:
self._status.discard(status)
else:
self._status.remove(status)
def has_status(self, status):
"""
Returns True, if redfam has given status
@param status Statusstring to check
@type status str
@returns True if status is present else False
"""
if status in self._status:
return True
else:
return False
def _parse_status(self, raw_status ):
"""
Sets status based on comma separated list
@param raw_status Commaseparated string of stati (from DB)
@type raw_status str
"""
self._status = set( raw_status.strip().split(","))
def _raw_status( self ):
"""
Returns status as commaseparated string (to save in DB)
@returns Raw status string
@rtype str
"""
return ",".join( self._status )
cls.session.commit()

Loading…
Cancel
Save