|
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- #
- # countrylist.py
- #
- # Copyright 2017 Jonathan Golder <jonathan@golderweb.de>
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
- # MA 02110-1301, USA.
- #
- #
- """
- Provides a class for handling charts list per country and year
- """
-
- import re
- import locale
- from datetime import datetime
-
- from isoweek import Week
-
- import pywikibot
- import mwparserfromhell as mwparser
-
- import jogobot
-
-
class CountryList():
    """
    Handles charts list per country and year

    An instance wraps one wiki page holding a list of number-one hits.
    After parse() has run, the attributes chartein, titel and interpret
    describe the latest list entry found in the "Singles" section of
    that page.
    """

    def __init__(self, wikilink):
        """
        Generate new instance of class

        Checks whether the page given with wikilink exists and reads the
        year out of the page title. The page content is not parsed here,
        call parse() for that.

        @param wikilink     Wikilink object by mwparser linking CountryList

        @raises CountryListError    If the page does not exist or no year
                                    could be found in its title
        """

        # Generate pywikibot site object
        # @TODO: Maybe store it outside???
        self.site = pywikibot.Site()

        # Set locale to 'de_DE.UTF-8' (setlocale acts on the whole process)
        locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

        # Generate pywikibot page object and remember the link it came from
        self.page = pywikibot.Page(self.site, wikilink.title)
        self.wikilink = wikilink

        # Check if page exists
        if not self.page.exists():
            raise CountryListError("CountryList " +
                                   str(wikilink.title) + " does not exists!")

        # Initialise attributes which get filled while parsing
        for attr in ("wikicode", "entry", "chartein", "_chartein_raw",
                     "_titel_raw", "titel", "interpret", "_interpret_raw"):
            setattr(self, attr, None)

        self.parsed = False

        # Try to find year
        self.find_year()

    def is_parsing_needed(self, revid):
        """
        Check if current revid of CountryList differs from given one

        @param revid    Revid to check against

        @return True    Given revid differs from current revid
                False   Given revid is equal to current revid
        """
        return revid != self.page.latest_revision_id

    def find_year(self):
        """
        Try to find the year related to CountryList using regex

        The year is expected as four digits in parentheses within the page
        title and is stored as int in self.year.

        @raises CountryListError    If the title contains no such year
        """
        match = re.search(r"^.+\((\d{4})\)", self.page.title())

        if not match:
            raise CountryListError("CountryList year is errorneous!")

        self.year = int(match.group(1))

    def parse(self):
        """
        Handles the parsing process

        Stores the parsed revision id in self.revid, fills chartein, titel
        and interpret and sets self.parsed to True on success.
        """

        # Remember which revision the parsed data belongs to
        self.revid = self.page.latest_revision_id

        # Parse page with mwparser
        self.generate_wikicode()

        # Select latest entry
        self.get_latest_entry()

        # Prepare chartein, titel, interpret
        self.prepare_chartein()
        self.prepare_titel()
        self.prepare_interpret()

        # For easy detecting whether we have parsed self
        self.parsed = True

        # Log parsed page
        jogobot.output("Parsed revision {revid} of page [[{title}]]".format(
            revid=self.revid, title=self.page.title()))

    def detect_belgian(self):
        """
        Detect whether current list is one of the belgian ones

        Looks for the province name in text and title of the wikilink.

        @return     Pattern for the province section heading ("Wallonie"
                    or "Flandern"), None if no province name was found
        """
        link_text = str(self.wikilink.text)
        link_title = str(self.wikilink.title)

        # Province name in link -> pattern used to select the section
        provinces = (("Wallonien", "Wallonie"),
                     ("Flandern", "Flandern"))

        for province, section in provinces:
            if province in link_text or province in link_title:
                return section

        return None

    def generate_wikicode(self):
        """
        Runs mwparser on page.text to get mwparser.objects
        """
        self.wikicode = mwparser.parse(self.page.text)

    def get_latest_entry(self):
        """
        Get latest list entry template object

        Selects the "Singles" section (for belgian lists inside the section
        of the province), takes the first wrapping template
        "Nummer-eins-Hits" and stores the last "Nummer-eins-Hits Zeile"
        template of its parameter "Inhalt" in self.entry.

        @raises CountryListError    If section, wrapping template or entry
                                    could not be found
        """

        # For belgian list we need to select subsection of country
        belgian = self.detect_belgian()

        # Select Singles-Section, catch error if we have none
        try:
            if belgian:
                singles_section = self.wikicode.get_sections(
                    matches=belgian)[0].get_sections(matches="Singles")[0]
            else:
                singles_section = self.wikicode.get_sections(
                    matches="Singles")[0]
        except IndexError as err:
            raise CountryListError("No Singles-Section found!") from err

        # Since we have multiple categories in some countries we need
        # to select the first wrapping template
        try:
            wrapping = next(singles_section.ifilter_templates(
                matches="Nummer-eins-Hits"))
        except StopIteration as err:
            raise CountryListError("Wrapping template is missing!") from err

        # Select the last occurrence of template "Nummer-eins-Hits Zeile" in
        # wrapper template. Start from None so that an entry found by an
        # earlier parse run can not be mistaken for a current one.
        latest = None
        for latest in wrapping.get("Inhalt").value.ifilter_templates(
                matches="Nummer-eins-Hits Zeile"):
            pass

        self.entry = latest

        # Check if we have found something
        if latest is None:
            raise CountryListError(self.page.title())

    def get_year_correction(self):
        """
        Reads value of Jahr parameter for correcting week numbers near to
        year changes

        @return     1 or -1 if parameter "Jahr" is "+1" respectively "-1",
                    0 if the parameter is missing or has another value
        """
        if self.entry.has("Jahr"):

            # Compare the parameter *value* only, the string form of a
            # named parameter also contains its name ("Jahr=+1")
            jahr = str(self.entry.get("Jahr").value).strip()

            if jahr == "+1":
                return 1
            elif jahr == "-1":
                return -1

        # None or wrong parameter value
        return 0

    def prepare_chartein(self):
        """
        Checks whether self._chartein_raw is a date or a week number and
        calculates related date object

        The result is stored in self.chartein.
        """

        # If self._chartein_raw is not set, get it
        if not self._chartein_raw:
            self.get_chartein_value()

        # Numeric string means week number
        if self._chartein_raw.isnumeric():

            # Monday of given week, the year may be shifted by parameter
            # "Jahr" for weeks near to year changes
            # NOTE(review): Week.monday() presumably returns a date while
            # the branch below stores a datetime - confirm callers cope
            self.chartein = Week(self.year + self.get_year_correction(),
                                 int(self._chartein_raw)).monday()

        # Complete date string present
        else:
            self.chartein = datetime.strptime(self._chartein_raw,
                                              "%Y-%m-%d")

    def get_chartein_value(self):
        """
        Reads value of Chartein parameter

        Ref-tags and surrounding whitespace are removed, the result is
        stored as str in self._chartein_raw.

        @raises CountryListEntryError   If param is not present
        """
        if not self.entry.has("Chartein"):
            raise CountryListEntryError(
                "Template Parameter 'Chartein' is missing!")

        raw = self.entry.get("Chartein").value

        # Remove possible ref-tags
        self._strip_refs(raw)

        # Remove whitespace
        self._chartein_raw = str(raw).strip()

    def prepare_titel(self):
        """
        Loads and prepares Titel of latest entry

        If the raw value contains no wikilink, a matching one is searched
        on the page. The result is stored in self.titel.
        """

        # If self._titel_raw is not set, get it
        if not self._titel_raw:
            self.get_titel_value()

        # Try to find a wikilink for Titel on countrylist
        if "[[" not in self._titel_raw:
            self.titel = self._search_links(str(self._titel_raw))
        else:
            self.titel = self._titel_raw

    def get_titel_value(self):
        """
        Reads value of Titel parameter

        Everything from the first linebreak on, ref-tags and surrounding
        whitespace are removed, the result is stored as str in
        self._titel_raw.

        @raises CountryListEntryError   If param is not present
        """
        if not self.entry.has("Titel"):
            raise CountryListEntryError(
                "Template Parameter 'Titel' is missing!")

        raw = self.entry.get("Titel").value

        # Only use part before possible "<br"
        self.remove_lines(raw)

        # Remove possible ref-tags
        self._strip_refs(raw)

        # Remove whitespace
        self._titel_raw = str(raw).strip()

    def prepare_interpret(self):
        """
        Loads and prepares Interpret of latest entry

        The raw value is split into names at the separators "feat." and
        "&". For names without wikilink a matching link is searched on the
        page. The result is stored in self.interpret.
        """

        # If self._interpret_raw is not set, get it
        if not self._interpret_raw:
            self.get_interpret_value()

        # Interpret name separating words
        seps = ("feat.", "&")

        # List of names and separators, names are located at even positions
        parts = [" "]
        # Positions in parts which hold a name without wikilink
        indexes = []
        index = 0

        # Reconcatenate interpret names word by word
        for word in self._interpret_raw.split():

            if word in seps:
                # Count up index 2 times ( Separator + next Name )
                index += 2
                parts.append(word)
                parts.append(" ")
                continue

            # Name part, append it without unnecessary whitespace
            parts[-1] = (parts[-1] + " " + word).strip()

            # We only need to work on it, if no wikilink is present
            if index not in indexes and "[[" not in parts[-1]:
                indexes.append(index)

        # If we have names without links, search for links
        if indexes:
            parts = self._search_links(parts, indexes)
            self.interpret = " ".join(parts)

        # Nothing to do, just use raw
        else:
            self.interpret = self._interpret_raw

    def get_interpret_value(self):
        """
        Reads value of Interpret parameter

        Everything from the first linebreak on and ref-tags are removed,
        the first SortKey/SortKeyName template is replaced by plain text
        respectively a wikilink. The result is stored as str in
        self._interpret_raw.

        @raises CountryListEntryError   If param is not present
        """
        if not self.entry.has("Interpret"):
            raise CountryListEntryError(
                "Template Parameter 'Interpret' is missing!")

        raw = self.entry.get("Interpret").value

        # Only use part before possible "<br"
        self.remove_lines(raw)

        # Remove possible ref-tags
        self._strip_refs(raw)

        # Handle SortKeyName and SortKey
        for template in raw.ifilter_templates(matches="SortKey"):

            # Loose page name comparison, tolerates whitespace around the
            # template name
            if template.name.matches("SortKeyName"):
                # Construct link out of Template, Params:
                #   1 = Surname
                #   2 = Name
                #   3 = Link-Dest (optional)
                label = (str(template.get(1).value) + " " +
                         str(template.get(2).value))

                # Differing Link-Destination is provided as param 3
                if template.has(3):
                    interpret_link = mwparser.nodes.wikilink.Wikilink(
                        str(template.get(3).value), label)
                # Default Link-Dest is the label itself
                else:
                    interpret_link = mwparser.nodes.wikilink.Wikilink(label)

                # Replace Template with link
                raw.replace(template, interpret_link)

            # SortKey: Replace with text from param 2 if present
            elif template.has(2):
                raw.replace(template, template.get(2).value)

            # Else remove SortKey (text should follow behind SortKey)
            else:
                raw.replace(template, None)

            # Only the first template is handled. Normally there should be
            # only one SortKey template but ... it's a wiki
            break

        # Remove whitespace
        self._interpret_raw = str(raw).strip()

    @staticmethod
    def _strip_refs(wikicode):
        """
        Removes all ref-tags from given wikicode in place

        @param wikicode     Object to remove the tags from
        @type wikicode:     mwparserfromhell.wikicode.Wikicode
        """
        # Collect the tags first, since wikicode is changed while removing
        for ref in list(wikicode.ifilter_tags(matches="ref")):
            wikicode.remove(ref)

    def _search_links(self, keywords, indexes=None):
        """
        Search matching wikilinks for keyword(s) in CountryList's wikicode

        A keyword matches if it is equal to text or title of a wikilink.
        Given lists (keywords as well as indexes) are changed in place.

        @param keywords: One or more keywords to search for
        @type keywords: str, list
        @param indexes: List with numeric indexes for items of keywords to work
                        on only
        @type indexes: list of ints
        @return:        List or String with replaced keywords
        @return type:   str, list
        """

        # Maybe convert keywords string to list
        string = isinstance(keywords, str)
        if string:
            keywords = [keywords]

        # If indexes worklist was not provided, work on all elements
        if not indexes:
            indexes = list(range(len(keywords)))

        # Iterate over wikilinks of page and try to find related links
        for wikilink in self.wikicode.ifilter_wikilinks():

            # Iterate over keywords still without link
            for index in indexes:

                # Check whether wikilink matches
                if (keywords[index] == wikilink.text or
                        keywords[index] == wikilink.title):

                    # Overwrite name with complete wikilink
                    keywords[index] = str(wikilink)

                    # Remove index from worklist. This is only safe since
                    # we leave the loop over indexes right afterwards
                    indexes.remove(index)
                    break

            # If worklist is empty, stop iterating over wikilinks
            if not indexes:
                break

        # Choose whether return list or string based on input type
        if string:
            return str(keywords[0])

        return keywords

    def remove_lines(self, wikicode):
        """
        Removes linebreaks (<br>) and everything after them in given wikicode

        The given object is changed in place.

        @param wikicode     Object to cut at the first linebreak
        @type wikicode:     mwparserfromhell.wikicode.Wikicode

        @raises TypeError   If wikicode is not a Wikicode object
        """
        # Catch wrong typed param
        if not isinstance(wikicode, mwparser.wikicode.Wikicode):
            raise TypeError(str(type(self)) + ".remove_lines() expects " +
                            "parameter 'wikicode' of type " +
                            "'mwparserfromhell.wikicode.Wikicode', " +
                            str(type(wikicode)) + " was given!")

        # Find first linebreak
        br = next(wikicode.ifilter_tags(matches="br"), None)

        # If there is one, get its position and slice nodes-list
        # NOTE(review): the position is looked up in the top level nodes
        # only, a linebreak nested in another node presumably raises
        # ValueError here - confirm whether that can happen on list pages
        if br:
            brpos = wikicode.nodes.index(br)
            wikicode.nodes = wikicode.nodes[0:brpos]

    def __str__(self):
        """
        Returns str representation for Object

        Parsed data is only included if parse() has been run.
        """
        if not self.parsed:
            return "CountryList( Link = \"{link}\" )".format(
                link=repr(self.wikilink))

        return ("CountryList( Link = \"{link}\", Revid = \"{revid}\", " +
                "Interpret = \"{interpret}\", Titel = \"{titel}\", " +
                "Chartein = \"{chartein}\" )").format(
                    link=repr(self.wikilink),
                    revid=self.revid,
                    interpret=self.interpret,
                    titel=self.titel,
                    chartein=repr(self.chartein))
-
-
class CountryListError(Exception):
    """
    Base class for all errors raised by class CountryList
    """
-
-
class CountryListEntryError(CountryListError):
    """
    Raised by class CountryList for errors related to a single list entry

    Since it is derived from CountryListError, handlers for that class
    catch it as well.
    """
-
-
class CountryListUnitTest():
    """
    Defines Test-Functions for CountryList-Module

    Tests run against wiki pages, either against the predefined test cases
    below or against one page given by the user.
    """

    # Expected parsing results, every key except "Link" is the name of the
    # CountryList attribute which must be equal to the given value
    testcases = ( { "Link": mwparser.nodes.Wikilink( "Benutzer:JogoBot/Charts/Tests/Liste der Nummer-eins-Hits in Frankreich (2015)" ),  # noqa
                    "revid": 148453827,
                    "interpret": "[[Adele (Sängerin)|Adele]]",
                    "titel": "[[Hello (Adele-Lied)|Hello]]",
                    "chartein": datetime( 2015, 10, 23 ) },
                  { "Link": mwparser.nodes.Wikilink( "Benutzer:JogoBot/Charts/Tests/Liste der Nummer-eins-Hits in Belgien (2015)", "Wallonien"),  # noqa
                    "revid": 148455281,
                    "interpret": "[[Nicky Jam]] & [[Enrique Iglesias (Sänger)|Enrique Iglesias]]",  # noqa
                    "titel": "El perdón",
                    "chartein": datetime( 2015, 9, 12 ) } )

    def __init__(self, page=None):
        """
        Constructor

        @param page     Title of page for manual testing, if omitted (or
                        empty) the predefined test cases are used
        """
        self.page_link = mwparser.nodes.Wikilink(page) if page else None

    def treat(self):
        """
        Start testing either manually with page provided by cmd-arg page or
        automatically with predefined test cases
        """
        if not self.page_link:
            self.auto_test()
            return

        self.man_test()

    def auto_test(self):
        """
        Run automatic tests with predefined test data from wiki

        Raises Exception for the first result which differs from the
        expected one.
        """
        for case in type(self).testcases:

            self.countrylist = CountryList(case["Link"])
            expected_revid = case["revid"]

            # Expected revid must be detected as current one, any other
            # revid as outdated
            if (self.countrylist.is_parsing_needed(expected_revid) or not
                    self.countrylist.is_parsing_needed(expected_revid + 1)):
                raise Exception(
                    "CountryList.is_parsing_needed() does not work!")

            self.countrylist.parse()

            # Compare every expected value with the related attribute
            for key, expected in case.items():

                # The link is input, not a result
                if key == "Link":
                    continue

                if not expected == getattr(self.countrylist, key):
                    raise Exception(key + " – " + str(
                        getattr(self.countrylist, key)))

    def man_test(self):
        """
        Run manual test with page given in parameter

        Only prints the parsed data, checking it is up to the user.
        """
        countrylist = CountryList(self.page_link)
        self.countrylist = countrylist

        countrylist.parse()

        print(countrylist)
        print("Since we have no data to compare, you need to manually " +
              "check data above against given page to ensure correct " +
              "working of module!")
-
-
def main(*args):
    """
    Handling direct calls --> unittest

    With argument "-page:<title>" the given page is parsed and printed for
    manual checking, without it the predefined test cases are run.

    @param args     Command line arguments, handed over to pywikibot
    """
    # Process global arguments to determine desired site
    local_args = pywikibot.handle_args(args)

    # Default to automatic tests, otherwise page would be undefined below
    # if no "-page:" argument was given
    page = None

    # Parse command line arguments
    for arg in local_args:
        if arg.startswith("-page:"):
            page = arg[len("-page:"):]

    # Call unittest-class
    test = CountryListUnitTest(page)
    test.treat()


if __name__ == "__main__":
    main()
|