You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

579 lines
18 KiB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# countrylist.py
#
# Copyright 2015 GOLDERWEB – Jonathan Golder <jonathan@golderweb.de>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
"""
Provides a class for handling charts list per country and year
"""
import re
import locale
from datetime import datetime
from isoweek import Week
import pywikibot
import mwparserfromhell as mwparser
class CountryList():
    """
    Handles charts list per country and year

    Wraps the wiki page a given wikilink points to, parses it with
    mwparser and exposes the data of the latest list entry via the
    attributes chartein, titel and interpret (filled by parse()).
    """

    def __init__(self, wikilink):
        """
        Generate new instance of class

        Checks whether the page the given wikilink points to exists and
        tries to detect the year of the list from the page title.

        @param wikilink     Wikilink object by mwparser linking CountryList
        @raise CountryListError     if the page does not exist or no year
                                    can be found in its title
        """
        # Generate pywikibot site object
        # @TODO: Maybe store it outside???
        self.site = pywikibot.Site()

        # Set locale to 'de_DE.UTF-8'
        # NOTE(review): locale.setlocale changes the locale of the whole
        # process, not only of this object
        locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

        # Generate pywikibot page object and keep the link it came from
        self.page = pywikibot.Page(self.site, wikilink.title)
        self.wikilink = wikilink

        # Check if page exists
        if not self.page.exists():
            raise CountryListError(
                "CountryList " + str(wikilink.title) + " does not exists!")

        # Initialise attributes which are filled while parsing
        for attr in ("wikicode", "entry", "chartein", "_chartein_raw",
                     "_titel_raw", "titel", "interpret", "_interpret_raw"):
            setattr(self, attr, None)

        self.parsed = False

        # Try to find year
        self.find_year()

    def is_parsing_needed(self, revid):
        """
        Check if current revid of CountryList differs from given one

        @param revid    Revid to check against
        @return True    Given revid differs from current revid
                False   Given revid is equal to current revid
        """
        return revid != self.page.latest_revision_id

    def find_year(self):
        """
        Try to find the year related to CountryList using regex

        The year is expected as four digits in parentheses in the page
        title and is stored as int in self.year.

        @raise CountryListError     if the title contains no such year
        """
        match = re.search(r"^.+\((\d{4})\)", self.page.title())

        if not match:
            raise CountryListError("CountryList year is errorneous!")

        self.year = int(match.group(1))

    def parse(self):
        """
        Handles the parsing process

        Stores the parsed revision id in self.revid, fills chartein, titel
        and interpret and finally sets self.parsed to True.
        """
        # Set revid
        self.revid = self.page.latest_revision_id

        # Parse page with mwparser
        self.generate_wikicode()

        # Select latest entry
        self.get_latest_entry()

        # Prepare chartein, titel, interpret
        self.prepare_chartein()
        self.prepare_titel()
        self.prepare_interpret()

        # For easy detecting whether we have parsed self
        self.parsed = True

    def detect_belgian(self):
        """
        Detect whether current list is one of the belgian ones
        (Wallonien/Flandern)

        @return     Section search term ("Wallonie" or "Flandern") if the
                    region name occurs in link text or link title,
                    otherwise None
        """
        link_text = str(self.wikilink.text)
        link_title = str(self.wikilink.title)

        if "Wallonien" in link_text or "Wallonien" in link_title:
            return "Wallonie"

        if "Flandern" in link_text or "Flandern" in link_title:
            return "Flandern"

        return None

    def generate_wikicode(self):
        """
        Runs mwparser on page.text to get mwparser.objects
        """
        self.wikicode = mwparser.parse(self.page.text)

    def get_latest_entry(self):
        """
        Get latest list entry template object

        Selects the section "Singles" (for belgian lists the one inside the
        region's section), takes the first wrapping template
        "Nummer-eins-Hits" in it and stores the last template
        "Nummer-eins-Hits Zeile" of its parameter "Inhalt" in self.entry.

        @raise CountryListError     if section, wrapping template, its
                                    parameter "Inhalt" or an entry is missing
        """
        # For belgian list we need to select subsection of country
        belgian = self.detect_belgian()

        # Select Singles-Section, catch Error if we have none
        try:
            if belgian:
                region_section = self.wikicode.get_sections(
                    matches=belgian)[0]
                singles_section = region_section.get_sections(
                    matches="Singles")[0]
            else:
                singles_section = self.wikicode.get_sections(
                    matches="Singles")[0]
        except IndexError:
            raise CountryListError("No Singles-Section found!")

        # Since we have multiple categories in some countries we need
        # to select the first wrapping template
        try:
            wrapping = next(singles_section.ifilter_templates(
                matches="Nummer-eins-Hits"))
        except StopIteration:
            raise CountryListError("Wrapping template is missing!")

        # Template.get raises ValueError for an unknown parameter; report
        # it like every other structural problem of the list
        try:
            content = wrapping.get("Inhalt").value
        except ValueError:
            raise CountryListError(
                "Wrapping template has no parameter 'Inhalt'!")

        # Select the last occurrence of template "Nummer-eins-Hits Zeile"
        # in wrapper template. Start from None so an entry of an earlier
        # parse run can not survive if nothing is found this time.
        latest = None
        for latest in content.ifilter_templates(
                matches="Nummer-eins-Hits Zeile"):
            pass

        self.entry = latest

        # Check if we have found something
        if self.entry is None:
            raise CountryListError(self.page.title())

    def get_year_correction(self):
        """
        Reads value of Jahr parameter for correcting week numbers near to
        year changes

        @return     1 for "+1", -1 for "-1", 0 if the parameter is missing
                    or has any other value
        """
        if self.entry.has("Jahr"):
            # Compare the parameter *value*; the string form of a named
            # parameter also contains its name ("Jahr=+1") and would
            # therefore never equal "+1" or "-1"
            jahr = str(self.entry.get("Jahr").value).strip()

            if jahr == "+1":
                return 1
            if jahr == "-1":
                return -1

        # None or wrong parameter value
        return 0

    def prepare_chartein(self):
        """
        Checks whether self._chartein_raw is a date or a week number and
        calculates the related date object, stored in self.chartein
        """
        # If self._chartein_raw is not set, get it
        if not self._chartein_raw:
            self.get_chartein_value()

        # Numeric string means week number
        if self._chartein_raw.isnumeric():
            # Use monday of given week in the (maybe corrected) list year
            # NOTE(review): Week.monday() presumably returns a date while
            # the branch below yields a datetime - confirm callers cope
            year = self.year + self.get_year_correction()
            self.chartein = Week(year, int(self._chartein_raw)).monday()

        # Complete date string present
        else:
            self.chartein = datetime.strptime(self._chartein_raw,
                                              "%Y-%m-%d")

    def _get_entry_param(self, name):
        """
        Reads value of parameter name of self.entry and removes ref-tags

        @param name     Name of template parameter
        @return         Value of parameter (mwparser object), ref-tags
                        removed
        @raise CountryListEntryError    if parameter is not present
        """
        if not self.entry.has(name):
            raise CountryListEntryError(
                "Template Parameter '" + name + "' is missing!")

        value = self.entry.get(name).value

        # Remove possible ref-tags
        for ref in value.ifilter_tags(matches="ref"):
            value.remove(ref)

        return value

    def get_chartein_value(self):
        """
        Reads value of Chartein parameter into self._chartein_raw
        If param is not present raise Error
        """
        self._chartein_raw = str(self._get_entry_param("Chartein")).strip()

    def prepare_titel(self):
        """
        Loads and prepares Titel of latest entry
        """
        # If self._titel_raw is not set, get it
        if not self._titel_raw:
            self.get_titel_value()

        # Try to find a wikilink for Titel on countrylist
        if "[[" not in self._titel_raw:
            self.titel = self._search_links(str(self._titel_raw))
        else:
            self.titel = self._titel_raw

    def get_titel_value(self):
        """
        Reads value of Titel parameter into self._titel_raw
        If param is not present raise Error
        """
        self._titel_raw = str(self._get_entry_param("Titel")).strip()

    def prepare_interpret(self):
        """
        Loads and prepares Interpret of latest entry

        Splits the raw value at the separators "feat." and "&" and tries to
        find wikilinks for all names which are not linked yet.
        """
        # If self._interpret_raw is not set, get it
        if not self._interpret_raw:
            self.get_interpret_value()

        # Interpret name separating words
        seps = ("feat.", "&")

        # parts alternates names and separators: [name, sep, name, ...]
        parts = [" ", ]

        # Positions in parts holding names without wikilink
        indexes = list()
        index = 0

        # Reconcatenate interpret names word by word
        for word in self._interpret_raw.split():

            if word in seps:
                # Count up index 2 times ( Separator + next Name )
                index += 2
                parts.append(word)
                parts.append(" ")
                continue

            # Name part, append to current name without extra whitespace
            parts[-1] = (parts[-1] + " " + word).strip()

            # We only need to work on it, if no wikilink is present
            if index not in indexes and "[[" not in parts[-1]:
                indexes.append(index)

        # If we have names without links, search for links
        if indexes:
            parts = self._search_links(parts, indexes)
            self.interpret = " ".join(parts)

        # Nothing to do, just use raw
        else:
            self.interpret = self._interpret_raw

    def get_interpret_value(self):
        """
        Reads value of Interpret parameter into self._interpret_raw
        If param is not present raise Error

        A template SortKeyName is replaced by a wikilink, a template
        SortKey by its second parameter (or removed if it has none).
        """
        raw = self._get_entry_param("Interpret")

        # Handle SortKeyName and SortKey
        for template in raw.ifilter_templates(matches="SortKey"):

            # Loose comparison, tolerates whitespace around the name
            if template.name.matches("SortKeyName"):
                # Link text is built from params 1 and 2
                label = (str(template.get(1).value) + " " +
                         str(template.get(2).value))

                # Differing Link-Destination is provided as param 3
                if template.has(3):
                    interpret_link = mwparser.nodes.wikilink.Wikilink(
                        str(template.get(3).value), label)
                # By default the joined name is the link destination
                else:
                    interpret_link = mwparser.nodes.wikilink.Wikilink(
                        label)

                # Replace Template with link
                raw.replace(template, interpret_link)

            # SortKey
            else:
                # Replace SortKey with text from param 2 if present
                if template.has(2):
                    raw.replace(template, template.get(2).value)
                # Else remove SortKey (text should follow behind SortKey)
                else:
                    raw.replace(template, None)

            # Only the first matching template is handled; we also must
            # not go on iterating after having modified raw
            break

        # Remove whitespace
        self._interpret_raw = str(raw).strip()

    def _search_links(self, keywords, indexes=None):
        """
        Search matching wikilinks for keyword(s) in CountryList's wikicode

        @param keywords: One or more keywords to search for
        @type keywords: str, list
        @param indexes: List with numeric indexes for items of keywords to
                        work on only (not modified)
        @type indexes: list of ints
        @return: List or String with replaced keywords
        @return type: str, list
        """
        # Maybe convert keywords string to list
        single = isinstance(keywords, str)
        if single:
            keywords = [keywords, ]

        # Work on a copy so the list of the caller stays untouched; if no
        # worklist was provided, work on all elements
        if indexes:
            pending = list(indexes)
        else:
            pending = list(range(len(keywords)))

        # Iterate over wikilinks of page and try to find related links
        for wikilink in self.wikicode.ifilter_wikilinks():

            for index in pending:
                # Check whether link text or link title matches
                if(keywords[index] == wikilink.text or
                   keywords[index] == wikilink.title):

                    # Overwrite name with complete wikilink
                    keywords[index] = str(wikilink)

                    # Done with this keyword; leave the loop immediately
                    # as pending was modified
                    pending.remove(index)
                    break

            # If worklist is empty, stop iterating over wikilinks
            if not pending:
                break

        # Choose whether return list or string based on input type
        if single:
            return str(keywords[0])
        return keywords

    def __str__(self):
        """
        Returns str representation for Object
        """
        if self.parsed:
            return ("CountryList( Link = \"{link}\", Revid = \"{revid}\", " +
                    "Interpret = \"{interpret}\", Titel = \"{titel}\", " +
                    "Chartein = \"{chartein}\" )").format(
                        link=repr(self.wikilink),
                        revid=self.revid,
                        interpret=self.interpret,
                        titel=self.titel,
                        chartein=repr(self.chartein))

        return "CountryList( Link = \"{link}\" )".format(
            link=repr(self.wikilink))
class CountryListError(Exception):
    """
    Base exception for problems detected while handling a CountryList
    """
class CountryListEntryError(CountryListError):
    """
    Exception for problems with a single entry of a CountryList

    Derives from CountryListError, so handlers for the base class catch it
    as well.
    """
class CountryListUnitTest():
    """
    Defines Test-Functions for CountryList-Module

    Runs either against a page given by the caller (manual check of the
    printed result) or against the predefined testcases below.
    """

    # Predefined test data: "Link" is the page to load, every other key is
    # the name of a CountryList attribute with its expected value after
    # parsing
    testcases = ( { "Link": mwparser.nodes.Wikilink( "Benutzer:JogoBot/Charts/Tests/Liste der Nummer-eins-Hits in Frankreich (2015)" ),  # noqa
                    "revid": 148453827,
                    "interpret": "[[Adele (Sängerin)|Adele]]",
                    "titel": "[[Hello (Adele-Lied)|Hello]]",
                    "chartein": datetime( 2015, 10, 23 ) },
                  { "Link": mwparser.nodes.Wikilink( "Benutzer:JogoBot/Charts/Tests/Liste der Nummer-eins-Hits in Belgien (2015)", "Wallonien"),  # noqa
                    "revid": 148455281,
                    "interpret": "[[Nicky Jam]] & [[Enrique Iglesias (Sänger)|Enrique Iglesias]]",  # noqa
                    "titel": "El perdón",
                    "chartein": datetime( 2015, 9, 12 ) } )

    def __init__(self, page=None):
        """
        Constructor

        @param page     Title of page to test manually; if empty the
                        predefined testcases are used
        """
        if page:
            self.page_link = mwparser.nodes.Wikilink(page)
        else:
            self.page_link = None

    def treat(self):
        """
        Start testing either manually with page provided by cmd-arg page or
        automatically with predefined test case
        """
        if self.page_link:
            self.man_test()
        else:
            self.auto_test()

    def auto_test(self):
        """
        Run automatic tests with predefined test data from wiki

        @raise Exception    if is_parsing_needed() misbehaves or a parsed
                            attribute differs from its expected value
        """
        for case in type(self).testcases:

            self.countrylist = CountryList(case["Link"])

            # Must report "no parsing needed" for the stored revid and
            # "parsing needed" for any other one
            if(self.countrylist.is_parsing_needed(case["revid"]) or not
               self.countrylist.is_parsing_needed(case["revid"] + 1)):
                raise Exception(
                    "CountryList.is_parsing_needed() does not work!")

            self.countrylist.parse()

            for key, expected in case.items():

                # The link is input, not an expected result
                if key == "Link":
                    continue

                actual = getattr(self.countrylist, key)

                if expected != actual:
                    # Name the attribute and both values so a failure can
                    # be understood without a debugger
                    raise Exception(
                        key + ": expected " + repr(expected) +
                        ", got " + repr(actual))

    def man_test(self):
        """
        Run manual test with page given in parameter
        """
        self.countrylist = CountryList(self.page_link)

        self.countrylist.parse()

        print(self.countrylist)
        print("Since we have no data to compare, you need to manually " +
              "check data above against given page to ensure correct " +
              "working of module!")
def main(*args):
    """
    Handling direct calls --> unittest

    Recognised local argument:
        -page:<title>   Run the manual test against the given page instead
                        of the predefined automatic testcases

    @param args     Command line arguments, handed over to pywikibot
    """
    # Process global arguments to determine desired site
    local_args = pywikibot.handle_args(args)

    # Without -page: argument we fall back to the automatic tests; page
    # must be bound in that case as well
    page = None

    # Parse command line arguments
    for arg in local_args:
        if arg.startswith("-page:"):
            page = arg[len("-page:"):]

    # Call unittest-class
    test = CountryListUnitTest(page)
    test.treat()
# Run the module's self-test only when executed as a script, not on import
if __name__ == "__main__":
    main()