From 11bfb6807ccd5535e7acfe7a2df0dc0200bf1a22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Sun, 8 Nov 2015 21:16:33 +0100 Subject: [PATCH 1/9] CountryList-Module: Create new class CountryList to move code for handling country list in separate class --- countrylist.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 countrylist.py diff --git a/countrylist.py b/countrylist.py new file mode 100644 index 0000000..40358be --- /dev/null +++ b/countrylist.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# countrylist.py +# +# Copyright 2015 GOLDERWEB – Jonathan Golder +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. +# +# +""" +Provides a class for handling charts list per country and year +""" + +import locale +from datetime import datetime + +from isoweek import Week + +import pywikibot +import mwparserfromhell as mwparser + + +class CountryList(): + """ + Handles charts list per country and year + """ + + def __init__( self, wikilink ): + """ + Generate new instance of class + + Checks wether page given with country_list_link exists + """ + + self.site = pywikibot.Site() + + # Set locale to 'de_DE.UTF-8' + locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8') + + self.page = pywikibot.Page( self.site, wikilink.title ) + + if not self.page.exists(): + return False From 6ae8f4c6ad4deeaa120d6c981f81e922a6521092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Mon, 9 Nov 2015 19:59:20 +0100 Subject: [PATCH 2/9] CountryList-Module: Implement basic init method --- countrylist.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/countrylist.py b/countrylist.py index 40358be..14a8a11 100644 --- a/countrylist.py +++ b/countrylist.py @@ -38,20 +38,39 @@ class CountryList(): """ Handles charts list per country and year """ - + def __init__( self, wikilink ): """ Generate new instance of class - + Checks wether page given with country_list_link exists + + @param wikilink Wikilink object by mwparser linking CountryList + + @returns self Object representing CountryList + False if page does not exists """ - + + # Generate pywikibot site object + # @TODO: Maybe store it outside??? self.site = pywikibot.Site() - + # Set locale to 'de_DE.UTF-8' locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8') - + + # Generate pywikibot page object self.page = pywikibot.Page( self.site, wikilink.title ) - + + # Store given wikilink for page object + self.wikilink = wikilink + + # Check if page exits if not self.page.exists(): return False + + # Initialise attributes + __attr = ( "wikicode", "entry", "chartein", "_chartein_raw", + "_titel_raw", "titel", "interpret", "_interpret_raw" ) + for attr in __attr: + setattr( self, attr, None ) + From abc30707b53ef87d902295ef0b483038637e6fad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Mon, 9 Nov 2015 20:00:43 +0100 Subject: [PATCH 3/9] CountryList-Module: Implement method for checking if parsing is needed --- countrylist.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/countrylist.py b/countrylist.py index 14a8a11..0aa7bbd 100644 --- a/countrylist.py +++ b/countrylist.py @@ -74,3 +74,17 @@ class CountryList(): for attr in __attr: setattr( self, attr, None ) + def parsing_needed( self, revid ): + """ + Check if current revid of CountryList differs from given one + + @param int Revid to check against + + @return True Given revid differs from current revid + False Given revid is equal to current revid + """ + + if revid != self.page.latest_revision_id: + return True + else: + return False From 4a790912fc1ad6a1194c4f24f7605c0d199f25c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Mon, 9 Nov 2015 20:01:55 +0100 Subject: [PATCH 4/9] CountryList-Module: Implement method for detecting year related to list --- countrylist.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/countrylist.py b/countrylist.py index 0aa7bbd..b5169fc 100644 --- a/countrylist.py +++ b/countrylist.py @@ -74,6 +74,9 @@ class CountryList(): for attr in __attr: setattr( self, attr, None ) + # Try to find year + self.find_year() + def parsing_needed( self, revid ): """ Check if current revid of CountryList differs from given one @@ -88,3 +91,16 @@ class CountryList(): return True else: return False + + def find_year( self ): + """ + Try to find the year related to CountryList + """ + self.year = datetime.now().year + + # Check if year is in page.title, if not try last year + if str( self.year ) not in self.page.title(): + self.year -= 1 + # If last year does not match, raise YearError + if str( self.year ) not in self.page.title(): + raise CountryListYearError From 8858e81ee6c7f536df291b302aa2c507bdaf66d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Mon, 9 Nov 2015 20:04:11 +0100 Subject: [PATCH 5/9] CountryList-Module: Implement methods to get the latest entry of list --- countrylist.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/countrylist.py b/countrylist.py index b5169fc..b0c2a27 100644 --- a/countrylist.py +++ b/countrylist.py @@ -104,3 +104,49 @@ class CountryList(): # If last year does not match, raise YearError if str( self.year ) not in self.page.title(): raise CountryListYearError + + def detect_belgian( self ): + """ + Detect wether current entry is on of the belgian (Belgien/Wallonien) + """ + # Check if begian province name is in link text or title + if "Wallonien" in str( self.wikilink.text ) \ + or "Wallonien" in str( self.wikilink.title): + return "Wallonie" + elif "Flandern" in str( self.wikilink.text ) \ + or "Flandern" in str( self.wikilink.title): + return "Flandern" + else: + return None + + def generate_wikicode( self ): + """ + Runs mwparser on page.text to get mwparser.objects + """ + + self.wikicode = mwparser.parse( self.page.text ) + + def get_latest_entry( self ): + """ + Get latest list entry template object + """ + + # Select the section "Singles" + # For belgian list we need to select subsection of country + belgian = self.detect_belgian() + + if belgian: + singles_section = self.wikicode.get_sections( + matches=belgian )[0].get_sections( matches="Singles" )[0] + else: + singles_section = self.wikicode.get_sections( matches="Singles" )[0] + + # Select the last occurence of template "Nummer-eins-Hits Zeile" in + # "Singles"-section + for self.entry in singles_section.ifilter_templates( + matches="Nummer-eins-Hits Zeile" ): + pass + + # Check if we have found something + if not self.entry: + raise CountryListError( self.page.title() ) From d4ea57dae80d8ea39261b3eb04b32bfb18ec3602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Mon, 9 Nov 2015 20:05:37 +0100 Subject: [PATCH 6/9] CountryList-Module: Implement methods for handling Chartein-Date --- countrylist.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/countrylist.py b/countrylist.py index b0c2a27..0cfe394 100644 --- a/countrylist.py +++ b/countrylist.py @@ -150,3 +150,58 @@ class CountryList(): # Check if we have found something if not self.entry: raise CountryListError( self.page.title() ) + + def get_year_correction( self ): + """ + Reads value of jahr parameter for correcting week numbers near to + year changes + """ + # If param is present return correction, otherwise null + if self.entry.has( "Jahr" ): + + # Read value of param + jahr = self.entry.get( "Jahr" ).strip() + + if jahr == "+1": + return 1 + elif jahr == "-1": + return -1 + + # None or wrong parameter value + return 0 + + def prepare_chartein( self ): + """ + Checks wether self._chartein_raw is a date or a week number and + calculates related datetime object + """ + + # If self._chartein_raw is not set, get it + if not self._chartein_raw: + self.get_chartein_value() + + # Detect weather we have a date or a weeknumber for Template Param + # "Chartein" + # Numeric string means week number + if( self._chartein_raw.isnumeric() ): + + # Calculate date of monday in given week and add number of + # days given in Template parameter "Korrektur" with monday + # as day (zero) + self.chartein = ( Week( self.year + self.get_year_correction(), + int( self._chartein_raw ) ).monday() ) + # Complete date string present + else: + self.chartein = datetime.strptime( self._chartein_raw, + "%Y-%m-%d" ) + + def get_chartein_value( self ): + """ + Reads value of chartein parameter + If param is not present raise Error + """ + if self.entry.has( "Chartein" ): + self._chartein_raw = self.entry.get("Chartein").value.strip() + else: + raise CountryListEntryError( "Template Parameter 'Chartein' is \ +missing!" ) From 87aee8c42a724e8cc627a870e10df71777a602df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Mon, 9 Nov 2015 20:06:36 +0100 Subject: [PATCH 7/9] CountryList-Module: Implment methods for handling Titel-Parameter --- countrylist.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/countrylist.py b/countrylist.py index 0cfe394..fee11ab 100644 --- a/countrylist.py +++ b/countrylist.py @@ -205,3 +205,25 @@ class CountryList(): else: raise CountryListEntryError( "Template Parameter 'Chartein' is \ missing!" ) + + def prepare_titel( self ): + """ + Loads and prepares Titel of latest entry + """ + + # If self._titel_raw is not set, get it + if not self._titel_raw: + self.get_titel_value() + + self.titel = self._titel_raw + + def get_titel_value( self ): + """ + Reads value of Titel parameter + If param is not present raise Error + """ + if self.entry.has( "Titel" ): + self._titel_raw = self.entry.get("Titel").value.strip() + else: + raise CountryListEntryError( "Template Parameter 'Titel' is \ +missing!" ) From 41d3ca95ef4e64e86097e72f8ff752114e0ea93b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Mon, 9 Nov 2015 20:07:13 +0100 Subject: [PATCH 8/9] CountryList-Module: Implement methods for handling Interpret-Parameter Including searching for missing links --- countrylist.py | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/countrylist.py b/countrylist.py index fee11ab..a74e0e3 100644 --- a/countrylist.py +++ b/countrylist.py @@ -227,3 +227,93 @@ missing!" ) else: raise CountryListEntryError( "Template Parameter 'Titel' is \ missing!" ) + + def prepare_interpret( self ): + """ + Loads and prepares Interpret of latest entry + """ + + # If self._interpret_raw is not set, get it + if not self._interpret_raw: + self.get_interpret_value() + + # Work with interpret value to add missing links + # Split it in words + words = self._interpret_raw.split() + + print( words ) + + # Interpret name separating words + seps = ( "feat.", "&" ) + + # Create empty list for concatenated interpret names + parts = [ " ", ] + # Another list for managing indexes which need to be worked on + indexes = list() + index = 0 + + # Reconcatenate interpret names + for word in words: + + # Name parts + if word not in seps: + parts[-1] += (" " + word) + + # Remove unnecessary whitespace + parts[-1] = parts[-1].strip() + + # We only need to work on it, if no wikilink is present + if index not in indexes and "[[" not in parts[-1]: + indexes.append( index ) + else: + # Count up index 2 times ( Separator + next Name ) + index += 2 + parts.append( word ) + parts.append( " " ) + + # If we have indexes with out links, search for links + if indexes: + + # Iterate over wikilinks of refpage and try to find related links + for wikilink in self.wikicode.ifilter_wikilinks(): + + # Iterate over interpret names + for index in indexes: + + # Check wether wikilink matches + if parts[index] == wikilink.text \ + or parts[index] == wikilink.title: + + # Overwrite name with complete wikilink + parts[index] = str( wikilink ) + + # Remove index from worklist + indexes.remove( index ) + + # Other indexes won't also match + break + + # If worklist is empty, stop iterating over wikilinks + if not indexes: + break + + print( parts ) + + # Join the collected links + sep = " " + self.interpret = sep.join( parts ) + + # Nothing to do, just use raw + else: + self.interpret = self._interpret_raw + + def get_interpret_value( self ): + """ + Reads value of Interpret parameter + If param is not present raise Error + """ + if self.entry.has( "Interpret" ): + self._interpret_raw = self.entry.get("Interpret").value.strip() + else: + raise CountryListEntryError( "Template Parameter 'Interpret' is \ +missing!" ) From 2e8b4273e7d9b4288b85a49993b059330f3cee4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?GOLDERWEB=20=E2=80=93=20Jonathan=20Golder?= Date: Mon, 9 Nov 2015 20:09:18 +0100 Subject: [PATCH 9/9] CountryList-Module: Implement parse-method which handles the parsing sequence --- countrylist.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/countrylist.py b/countrylist.py index a74e0e3..18b508c 100644 --- a/countrylist.py +++ b/countrylist.py @@ -105,6 +105,22 @@ class CountryList(): if str( self.year ) not in self.page.title(): raise CountryListYearError + def parse( self ): + """ + Handles the parsing process + """ + + # Parse page with mwparser + self.generate_wikicode() + + # Select lastest entry + self.get_latest_entry() + + # Prepare chartein, titel, interpret + self.prepare_chartein() + self.prepare_titel() + self.prepare_interpret() + def detect_belgian( self ): """ Detect wether current entry is on of the belgian (Belgien/Wallonien) @@ -241,8 +257,6 @@ missing!" ) # Split it in words words = self._interpret_raw.split() - print( words ) - # Interpret name separating words seps = ( "feat.", "&" ) @@ -297,8 +311,6 @@ missing!" ) if not indexes: break - print( parts ) - # Join the collected links sep = " " self.interpret = sep.join( parts )