SQLAlchemy now working for the parser

Needs more testing; presumably still contains some bugs.
This commit is contained in:
2016-11-26 22:26:55 +01:00
parent 0ebf307bb8
commit 6e973369cd
4 changed files with 857 additions and 516 deletions

View File

@@ -3,7 +3,7 @@
#
# redfam.py
#
# Copyright 2015 GOLDERWEB Jonathan Golder <jonathan@golderweb.de>
# Copyright 2017 GOLDERWEB Jonathan Golder <jonathan@golderweb.de>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -35,16 +35,17 @@ import pywikibot # noqa
from pywikibot.tools import deprecated # noqa
import jogobot
from lib.mysqlred import MysqlRedFam
#~ from lib.mysqlred import Column, Integer, String, Text, DateTime, ForeignKey, ColumnList, Status
from lib.mysqlred import MysqlRedFam, MutableSet, ColumnList #, Mysql, Base, relationship, composite,
class RedFam:
class RedFam( MysqlRedFam ):
"""
Basic class for RedFams, containing the basic data structure
"""
def __init__( self, articlesList, beginning, ending=None, redpageid=None,
status=None, famhash=None, heading=None ):
status=MutableSet(), famhash=None, heading=None ):
"""
Generates a new RedFam object
@@ -61,21 +62,32 @@ class RedFam:
self.site = pywikibot.Site()
# Database interface
self._mysql = MysqlRedFam( famhash )
#self._mysql = MysqlRedFam( famhash )
# Initial attribute values
self._articlesList = articlesList
self._beginning = beginning
self._ending = ending
self._redpageid = redpageid
self._status = set()
self._status = self._parse_status(status)
self._famhash = famhash
self._heading = heading
#~ self.articlesList = articlesList
#~ self.beginning = beginning
#~ self.ending = ending
#~ self.redpageid = redpageid
#~ # self._status = set()
#~ # self._status = self._parse_status(status)
#~ self.famhash = famhash
#~ self.heading = heading
#self.status = status
# Calculates the sha1 hash over self._articlesList to
# rediscover known redundance families
self.calc_famhash()
#articlesStatus = ColumnList([ MutableSet() for x in range(0,8) ])
#~ # Calculates the sha1 hash over self._articlesList to
#~ # rediscover known redundance families
#~ self.calc_famhash()
#~ if not status:
#~ status = MutableSet()
super().__init__( articlesList=articlesList, beginning=beginning, ending=ending, redpageid=redpageid,
famhash=famhash, heading=heading, status=status, articlesStatus=None )
#super().__init__()
def __repr__( self ):
"""
@@ -85,64 +97,75 @@ class RedFam:
"""
__repr = "RedFam( " + \
"articlesList=" + repr( self._articlesList ) + \
", heading=" + repr( self._heading ) + \
", beginning=" + repr( self._beginning ) + \
", ending=" + repr( self._ending ) + \
", red_page_id=" + repr( self._redpageid ) + \
", status=" + repr( self._status ) + \
", fam_hash=" + repr( self._famhash ) + \
"articlesList=" + repr( self.articlesList ) + \
", heading=" + repr( self.heading ) + \
", beginning=" + repr( self.beginning ) + \
", ending=" + repr( self.ending ) + \
", red_page_id=" + repr( self.redpageid ) + \
", status=" + repr( self.status ) + \
", fam_hash=" + repr( self.famhash ) + \
" )"
return __repr
def calc_famhash( self ):
@classmethod
def calc_famhash(cls, articlesList ):
h = hashlib.sha1()
# Since articlesList attr of RedFam will have always 8 Members we
# need to fill up smaller lists (longers will be cropped below).
while len( articlesList) < 8:
articlesList.append(None)
h.update( str( articlesList[:8] ).encode('utf-8') )
return h.hexdigest()
def c_famhash( self ):
"""
Calculates the SHA-1 hash for the articlesList of redundance family.
Since we don't need security SHA-1 is just fine.
@returns str String with the hexadecimal hash digest
"""
print( type( self ) )
h = hashlib.sha1()
h.update( str( self._articlesList[:8] ).encode('utf-8') )
if self._famhash and h.hexdigest() != self._famhash:
raise RedFamHashError( self._famhash, h.hexdigest() )
elif self._famhash:
if self.famhash and type(self).calc_famhash(self.articlesList) != self.famhash:
raise RedFamHashError( self.famhash, h.hexdigest() )
elif self.famhash:
return
else:
self._famhash = h.hexdigest()
self.famhash = type(self).calc_famhash(self.articlesList)
def changed( self ):
"""
Checks wether anything has changed and maybe triggers db update
"""
#~ def changed( self ):
#~ """
#~ Checks wether anything has changed and maybe triggers db update
#~ """
# On archived redfams do not delete possibly existing ending
if( not self._ending and "archived" in self._status and
self._mysql.data[ 'ending' ] ):
#~ # On archived redfams do not delete possibly existing ending
#~ if( not self.ending and "archived" in self._status and
#~ self._mysql.data[ 'ending' ] ):
self._ending = self._mysql.data[ 'ending' ]
#~ self._ending = self._mysql.data[ 'ending' ]
# Since status change means something has changed, update database
if( self._raw_status != self._mysql.data[ 'status' ] or
self._beginning != self._mysql.data[ 'beginning' ] or
self._ending != self._mysql.data[ 'ending' ] or
self._red_page_id != self._mysql.data[ 'redpageid' ] or
self._heading != self._mysql.data[ 'heading' ]):
#~ # Since status change means something has changed, update database
#~ if( self._raw_status != self._mysql.data[ 'status' ] or
#~ self._beginning != self._mysql.data[ 'beginning' ] or
#~ self._ending != self._mysql.data[ 'ending' ] or
#~ self._red_page_id != self._mysql.data[ 'redpageid' ] or
#~ self._heading != self._mysql.data[ 'heading' ]):
self._mysql.update_fam( self._redpageid, self._heading,
self._beginning, self._ending,
self._raw_status() )
#~ self._mysql.update_fam( self._redpageid, self._heading,
#~ self._beginning, self._ending,
#~ self._raw_status() )
@classmethod
def flush_db_cache( cls ):
"""
Calls flush method of Mysql Interface class
"""
MysqlRedFam.flush()
cls.session.commit()
#~ MysqlRedFam.flush()
def add_status(self, status):
"""
@@ -151,7 +174,7 @@ class RedFam:
@param status Statusstring to add
@type status str
"""
self._status.add(status)
self.status.add(status)
def remove_status(self, status, weak=True):
"""
@@ -164,9 +187,9 @@ class RedFam:
@type bool
"""
if weak:
self._status.discard(status)
self.status.discard(status)
else:
self._status.remove(status)
self.status.remove(status)
def has_status(self, status):
"""
@@ -176,28 +199,28 @@ class RedFam:
@type status str
@returns True if status is present else False
"""
if status in self._status:
if status in self.status:
return True
else:
return False
def _parse_status(self, raw_status ):
"""
Sets status based on comma separated list
#~ def _parse_status(self, raw_status ):
#~ """
#~ Sets status based on comma separated list
@param raw_status Commaseparated string of stati (from DB)
@type raw_status str
"""
self._status = set( raw_status.strip().split(","))
#~ @param raw_status Commaseparated string of stati (from DB)
#~ @type raw_status str
#~ """
#~ self._status = set( raw_status.strip().split(","))
def _raw_status( self ):
"""
Returns status as commaseparated string (to save in DB)
#~ def _raw_status( self ):
#~ """
#~ Returns status as commaseparated string (to save in DB)
@returns Raw status string
@rtype str
"""
return ",".join( self._status )
#~ @returns Raw status string
#~ @rtype str
#~ """
#~ return ",".join( self._status )
def article_add_status(self, status, index=None, title=None ):
"""
@@ -331,7 +354,7 @@ class RedFamParser( RedFam ):
wurde gewünscht von:"
__done_notice2 = "{{Erledigt|"
def __init__( self, heading, redpage, redpagearchive,
def __init__( self, articlesList, heading, redpage, redpagearchive,
beginning, ending=None ):
"""
Creates a RedFam object based on data collected while parsing red_pages
@@ -346,57 +369,111 @@ class RedFamParser( RedFam ):
str strptime parseable string
"""
# Set object attributes:
self._redpageid = redpage._pageid
self._redpagearchive = redpagearchive
self._famhash = None
# Method self.add_beginning sets self._beginning directly
self.add_beginning( beginning )
# Method self.add_ending sets self._ending directly
if( ending ):
self.add_ending( ending )
else:
# If no ending was provided set to None
self._ending = None
self._status = set()
# Parse the provided heading of redundance section
# to set self._articlesList
self.heading_parser( heading )
#~ self.heading = str(heading)
#~ self.articlesList = articlesList
#~ # Catch sections with more then 8 articles, print error
#~ if len( self.articlesList ) > 8:
#~ # For repression in output we need to know the fam hash
#~ self.calc_famhash()
#~ jogobot.output(
#~ ( "\03{{lightred}}" +
#~ "Maximum number of articles in red_fam exceeded, " +
#~ "maximum number is 8, {number:d} were given \n {repress}"
#~ ).format( datetime=datetime.now().strftime(
#~ "%Y-%m-%d %H:%M:%S" ), number=len( self._articlesList ),
#~ repress=repr( self ) ),
#~ "WARNING" )
#~ # Only save the first 8 articles
#~ # self.articlesList = self.articlesList[:8]
# Calculates the sha1 hash over self._articlesList to
# rediscover known redundance families
famhash = type(self).calc_famhash(articlesList)
self.calc_famhash()
#~ obj = self.session.query(RedFamParser).filter(RedFamParser.famhash == self.famhash ).one_or_none()
#~ if obj:
#~ self = obj
# Open database connection, ask for data if existing,
# otherwise create entry
self.__handle_db()
# Set object attributes:
#~ self.redpageid = redpage._pageid
self._redpagearchive = redpagearchive
# self.famhash = None
# Method self.add_beginning sets self._beginning directly
#~ self.add_beginning( beginning )
#~ # Method self.add_ending sets self._ending directly
#~ if( ending ):
#~ self.add_ending( ending )
#~ else:
#~ # If no ending was provided set to None
#~ self.ending = None
#~ self.status = MutableSet()
beginning = self.__datetime(beginning)
if ending:
ending = self.__datetime(ending)
super().__init__( articlesList, beginning, ending=ending, redpageid=redpage._pageid,
famhash=famhash, heading=heading )
# Check status changes
self.status()
self.check_status()
self.session.add(self)
# Open database connection, ask for data if existing,
# otherwise create entry
# self.__handle_db()
# Triggers db update if anything changed
self.changed()
# self.changed()
def __handle_db( self ):
"""
Handles opening of db connection
"""
# We need a connection to our mysqldb
self._mysql = MysqlRedFam( )
self._mysql.get_fam( self._famhash )
if not self._mysql.data:
self._mysql.add_fam( self._articlesList, self._heading,
self._redpageid, self._beginning,
self._ending )
#~ def __handle_db( self ):
#~ """
#~ Handles opening of db connection
#~ """
def heading_parser( self, heading ):
#~ # We need a connection to our mysqldb
#~ self._mysql = MysqlRedFam( )
#~ self._mysql.get_fam( self._famhash )
#~ if not self._mysql.data:
#~ self._mysql.add_fam( self._articlesList, self._heading,
#~ self._redpageid, self._beginning,
#~ self._ending )
def update( self, articlesList, heading, redpage, redpagearchive,
            beginning, ending=None ):
    """
    Overwrites the attributes of this object with the given values and
    re-checks the status afterwards

    @param articlesList Value assigned to self.articlesList
    @param heading Value assigned to self.heading
    @param redpage Object assigned to self.redpage, its pageid
                   attribute is assigned to self.redpageid
    @param redpagearchive Value assigned to self._redpagearchive
    @param beginning Beginning, handed over to self.add_beginning
    @param ending Ending, handed over to self.add_ending if truthy
    """
    self.articlesList = articlesList
    self.heading = heading
    self.redpage = redpage
    self.redpageid = redpage.pageid

    self.add_beginning( beginning )

    # Without a (truthy) ending the currently stored ending is kept
    if ending:
        self.add_ending( ending )

    self._redpagearchive = redpagearchive

    # Check status changes
    self.check_status()
@classmethod
def heading_parser( cls, heading ):
"""
Parses given red_fam_heading string and saves articles list
@@ -404,34 +481,16 @@ class RedFamParser( RedFam ):
@type heading wikicode or mwparser-parseable
"""
# Save heading as string
self._heading = str( heading )
# Parse string heading with mwparse again everytime
# In some cases the given wikicode is broken due to syntax errors
# (Task FS#77)
heading = mwparser.parse( self._heading )
heading = mwparser.parse( str( heading ) )
# Save destinations of wikilinks in headings
self._articlesList = [ str( link.title ) for link
return [ str( link.title ) for link
in heading.ifilter_wikilinks() ]
# Catch sections with more then 8 articles, print error
if len( self._articlesList ) > 8:
# For repression in output we need to know the fam hash
self.calc_famhash()
jogobot.output(
( "\03{{lightred}}" +
"Maximum number of articles in red_fam exceeded, " +
"maximum number is 8, {number:d} were given \n {repress}"
).format( datetime=datetime.now().strftime(
"%Y-%m-%d %H:%M:%S" ), number=len( self._articlesList ),
repress=repr( self ) ),
"WARNING" )
# Only save the first 8 articles
self._articlesList = self._articlesList[:8]
def add_beginning( self, beginning ):
"""
@@ -440,7 +499,7 @@ class RedFamParser( RedFam ):
@param datetime datetime Beginning date
"""
self._beginning = self.__datetime( beginning )
self.beginning = self.__datetime( beginning )
def add_ending( self, ending ):
"""
@@ -449,7 +508,7 @@ class RedFamParser( RedFam ):
@param datetime datetime Ending date
"""
self._ending = self.__datetime( ending )
self.ending = self.__datetime( ending )
def __datetime( self, timestamp ):
"""
@@ -473,7 +532,7 @@ class RedFamParser( RedFam ):
type( self ).__timestamp_format )
return result
def status( self ):
def check_status( self ):
"""
Handles detection of correct status
There are three possible stati:
@@ -485,7 +544,7 @@ class RedFamParser( RedFam ):
# No ending, discussion is running:
# Sometimes archived discussions also have no detectable ending
if not self._ending and not self._redpagearchive:
if not self.ending and not self._redpagearchive:
self.add_status("open")
else:
self.remove_status("open")
@@ -513,7 +572,7 @@ class RedFamParser( RedFam ):
return False
@classmethod
def parser( cls, text, page, isarchive=False ):
def parser( cls, text, redpage, isarchive=False ):
"""
Handles parsing of redfam section
@@ -536,16 +595,33 @@ class RedFamParser( RedFam ):
if not beginning:
match = re.search(
jogobot.config["redundances"]["reddiscs_onlyinclude_re"],
page.title() )
redpage.page.title() )
if match:
beginning = datetime.strptime(
"01. {month} {year}".format(
month=match.group(1), year=match.group(2)),
"%d. %B %Y" )
articlesList = RedFamParser.heading_parser( heading )
famhash = RedFamParser.calc_famhash( articlesList )
# Create the RedFam object
RedFamParser( heading, page, isarchive, beginning, ending )
# Check for existing objects in DB first in current redpage
redfam = redpage.redfams.get(famhash)
with RedFamParser.session.no_autoflush:
if not redfam:
# Otherwise in db table
redfam = RedFamParser.session.query(RedFamParser).filter(
RedFamParser.famhash == famhash ).one_or_none()
if redfam:
# Existing redfams need to be updated
redfam.update( articlesList, str(heading), redpage, isarchive, beginning, ending )
else:
# Create the RedFam object
redfam = RedFamParser( articlesList, str(heading).strip(), redpage.page, isarchive, beginning, ending )
return redfam
@classmethod
def extract_dates( cls, text, isarchive=False ):
@@ -615,16 +691,16 @@ class RedFamWorker( RedFam ):
mysql_data[ 'status' ], mysql_data[ 'famhash' ],
mysql_data[ 'heading' ] )
self._mysql.data = mysql_data
# #~ self._mysql.data = mysql_data
# Set up article status
index = 0
for article in self._articlesList:
raw_status = mysql_data[ "article" + str(index) + "_status" ]
if not raw_status:
raw_status = str()
self._article_parse_status( raw_status, index )
index += 1
#~ # Set up article status
#~ index = 0
#~ for article in self.articlesList:
#~ raw_status = mysql_data[ "article" + str(index) + "_status" ]
#~ if not raw_status:
#~ raw_status = str()
#~ self._article_parse_status( raw_status, index )
#~ index += 1
# Get related RedPage-Information
self.redpageid = mysql_data[ 'pageid' ]