No Description

countrylist.py 20KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. #
  4. # countrylist.py
  5. #
  6. # Copyright 2017 Jonathan Golder <jonathan@golderweb.de>
  7. #
  8. # This program is free software; you can redistribute it and/or modify
  9. # it under the terms of the GNU General Public License as published by
  10. # the Free Software Foundation; either version 2 of the License, or
  11. # (at your option) any later version.
  12. #
  13. # This program is distributed in the hope that it will be useful,
  14. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. # GNU General Public License for more details.
  17. #
  18. # You should have received a copy of the GNU General Public License
  19. # along with this program; if not, write to the Free Software
  20. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  21. # MA 02110-1301, USA.
  22. #
  23. #
  24. """
  25. Provides a class for handling charts list per country and year
  26. """
  27. import re
  28. import locale
  29. from datetime import datetime
  30. from isoweek import Week
  31. import pywikibot
  32. import mwparserfromhell as mwparser
  33. import jogobot
  34. class CountryList():
  35. """
  36. Handles charts list per country and year
  37. """
  38. def __init__( self, wikilink ):
  39. """
  40. Generate new instance of class
  41. Checks wether page given with country_list_link exists
  42. @param wikilink Wikilink object by mwparser linking CountryList
  43. @returns self Object representing CountryList
  44. False if page does not exists
  45. """
  46. # Generate pywikibot site object
  47. # @TODO: Maybe store it outside???
  48. self.site = pywikibot.Site()
  49. # Set locale to 'de_DE.UTF-8'
  50. locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
  51. # Generate pywikibot page object
  52. self.page = pywikibot.Page( self.site, wikilink.title )
  53. # Store given wikilink for page object
  54. self.wikilink = wikilink
  55. # Check if page exits
  56. if not self.page.exists():
  57. raise CountryListError( "CountryList " +
  58. str(wikilink.title) + " does not exists!" )
  59. # Initialise attributes
  60. __attr = ( "wikicode", "entry", "chartein", "_chartein_raw",
  61. "_titel_raw", "titel", "interpret", "_interpret_raw" )
  62. for attr in __attr:
  63. setattr( self, attr, None )
  64. self.parsed = False
  65. # Try to find year
  66. self.find_year()
  67. def is_parsing_needed( self, revid ):
  68. """
  69. Check if current revid of CountryList differs from given one
  70. @param int Revid to check against
  71. @return True Given revid differs from current revid
  72. False Given revid is equal to current revid
  73. """
  74. if revid != self.page.latest_revision_id:
  75. return True
  76. else:
  77. return False
  78. def find_year( self ):
  79. """
  80. Try to find the year related to CountryList using regex
  81. """
  82. match = re.search( r"^.+\((\d{4})\)", self.page.title() )
  83. # We matched something
  84. if match:
  85. self.year = int(match.group(1))
  86. else:
  87. raise CountryListError( "CountryList year is errorneous!" )
  88. def parse( self ):
  89. """
  90. Handles the parsing process
  91. """
  92. # Set revid
  93. self.revid = self.page.latest_revision_id
  94. # Parse page with mwparser
  95. self.generate_wikicode()
  96. # Select lastest entry
  97. self.get_latest_entry()
  98. # Prepare chartein, titel, interpret
  99. self.prepare_chartein()
  100. self.prepare_titel()
  101. self.prepare_interpret()
  102. # For easy detecting wether we have parsed self
  103. self.parsed = True
  104. # Log parsed page
  105. jogobot.output( "Parsed revision {revid} of page [[{title}]]".format(
  106. revid=self.revid, title=self.page.title() ) )
  107. def detect_belgian( self ):
  108. """
  109. Detect wether current entry is on of the belgian (Belgien/Wallonien)
  110. """
  111. # Check if begian province name is in link text or title
  112. if( "Wallonien" in str( self.wikilink.text ) or
  113. "Wallonien" in str( self.wikilink.title) ):
  114. return "Wallonie"
  115. elif( "Flandern" in str( self.wikilink.text ) or
  116. "Flandern" in str( self.wikilink.title) ):
  117. return "Flandern"
  118. else:
  119. return None
  120. def generate_wikicode( self ):
  121. """
  122. Runs mwparser on page.text to get mwparser.objects
  123. """
  124. self.wikicode = mwparser.parse( self.page.text )
  125. def get_latest_entry( self ):
  126. """
  127. Get latest list entry template object
  128. """
  129. # Select the section "Singles"
  130. # For belgian list we need to select subsection of country
  131. belgian = self.detect_belgian()
  132. # Select Singles-Section
  133. # Catch Error if we have none
  134. try:
  135. if belgian:
  136. singles_section = self.wikicode.get_sections(
  137. matches=belgian )[0].get_sections( matches="Singles" )[0]
  138. else:
  139. singles_section = self.wikicode.get_sections(
  140. matches="Singles" )[0]
  141. except IndexError:
  142. raise CountryListError( "No Singles-Section found!")
  143. # Since we have multiple categories in some countrys we need
  144. # to select the first wrapping template
  145. try:
  146. wrapping = next( singles_section.ifilter_templates(
  147. matches="Nummer-eins-Hits" ) )
  148. except StopIteration:
  149. raise CountryListError( "Wrapping template is missing!")
  150. # Select the last occurence of template "Nummer-eins-Hits Zeile" in
  151. # Wrapper-template
  152. for self.entry in wrapping.get("Inhalt").value.ifilter_templates(
  153. matches="Nummer-eins-Hits Zeile" ):
  154. pass
  155. # Check if we have found something
  156. if not self.entry:
  157. raise CountryListError( self.page.title() )
  158. def get_year_correction( self ):
  159. """
  160. Reads value of jahr parameter for correcting week numbers near to
  161. year changes
  162. """
  163. # If param is present return correction, otherwise null
  164. if self.entry.has( "Jahr" ):
  165. # Read value of param
  166. jahr = self.entry.get( "Jahr" ).strip()
  167. if jahr == "+1":
  168. return 1
  169. elif jahr == "-1":
  170. return -1
  171. # None or wrong parameter value
  172. return 0
  173. def prepare_chartein( self ):
  174. """
  175. Checks wether self._chartein_raw is a date or a week number and
  176. calculates related datetime object
  177. """
  178. # If self._chartein_raw is not set, get it
  179. if not self._chartein_raw:
  180. self.get_chartein_value()
  181. # Detect weather we have a date or a weeknumber for Template Param
  182. # "Chartein"
  183. # Numeric string means week number
  184. if( self._chartein_raw.isnumeric() ):
  185. # Calculate date of monday in given week and add number of
  186. # days given in Template parameter "Korrektur" with monday
  187. # as day (zero)
  188. self.chartein = ( Week( self.year + self.get_year_correction(),
  189. int( self._chartein_raw ) ).monday() )
  190. # Complete date string present
  191. else:
  192. self.chartein = datetime.strptime( self._chartein_raw,
  193. "%Y-%m-%d" )
  194. def get_chartein_value( self ):
  195. """
  196. Reads value of chartein parameter
  197. If param is not present raise Error
  198. """
  199. if self.entry.has( "Chartein" ):
  200. self._chartein_raw = self.entry.get("Chartein").value
  201. # Remove possible ref-tags
  202. for ref in self._chartein_raw.ifilter_tags(matches="ref"):
  203. self._chartein_raw.remove( ref )
  204. # Remove whitespace
  205. self._chartein_raw = str(self._chartein_raw).strip()
  206. else:
  207. raise CountryListEntryError( "Template Parameter 'Chartein' is \
  208. missing!" )
  209. def prepare_titel( self ):
  210. """
  211. Loads and prepares Titel of latest entry
  212. """
  213. # If self._titel_raw is not set, get it
  214. if not self._titel_raw:
  215. self.get_titel_value()
  216. # Try to find a wikilink for Titel on countrylist
  217. if "[[" not in self._titel_raw:
  218. self.titel = self._search_links( str(self._titel_raw) )
  219. else:
  220. self.titel = self._titel_raw
  221. def get_titel_value( self ):
  222. """
  223. Reads value of Titel parameter
  224. If param is not present raise Error
  225. """
  226. if self.entry.has( "Titel" ):
  227. self._titel_raw = self.entry.get("Titel").value
  228. # Only use part before possible "<br"
  229. self.remove_lines(self._titel_raw)
  230. # Remove possible ref-tags
  231. for ref in self._titel_raw.ifilter_tags(matches="ref"):
  232. self._titel_raw.remove( ref )
  233. # Remove whitespace
  234. self._titel_raw = str(self._titel_raw).strip()
  235. else:
  236. raise CountryListEntryError( "Template Parameter 'Titel' is \
  237. missing!" )
  238. def prepare_interpret( self ):
  239. """
  240. Loads and prepares Interpret of latest entry
  241. """
  242. # If self._interpret_raw is not set, get it
  243. if not self._interpret_raw:
  244. self.get_interpret_value()
  245. # Work with interpret value to add missing links
  246. # Split it in words
  247. words = self._interpret_raw.split()
  248. # Interpret name separating words
  249. seps = ( "feat.", "&" )
  250. # Create empty list for concatenated interpret names
  251. parts = [ " ", ]
  252. # Another list for managing indexes which need to be worked on
  253. indexes = list()
  254. index = 0
  255. # Reconcatenate interpret names
  256. for word in words:
  257. # Name parts
  258. if word not in seps:
  259. parts[-1] += (" " + word)
  260. # Remove unnecessary whitespace
  261. parts[-1] = parts[-1].strip()
  262. # We only need to work on it, if no wikilink is present
  263. if index not in indexes and "[[" not in parts[-1]:
  264. indexes.append( index )
  265. else:
  266. # Count up index 2 times ( Separator + next Name )
  267. index += 2
  268. parts.append( word )
  269. parts.append( " " )
  270. # If we have indexes without links, search for links
  271. if indexes:
  272. parts = self._search_links( parts, indexes )
  273. # Join the collected links
  274. sep = " "
  275. self.interpret = sep.join( parts )
  276. # Nothing to do, just use raw
  277. else:
  278. self.interpret = self._interpret_raw
  279. def get_interpret_value( self ):
  280. """
  281. Reads value of Interpret parameter
  282. If param is not present raise Error
  283. """
  284. if self.entry.has( "Interpret" ):
  285. self._interpret_raw = self.entry.get("Interpret").value
  286. # Only use part before possible "<br"
  287. self.remove_lines(self._interpret_raw)
  288. # Remove possible ref-tags
  289. for ref in self._interpret_raw.ifilter_tags(matches="ref"):
  290. self._interpret_raw.remove( ref )
  291. # Handle SortKeyName and SortKey
  292. for template in self._interpret_raw.ifilter_templates(
  293. matches="SortKey" ):
  294. if template.name == "SortKeyName":
  295. # Differing Link-Destination is provided as param 3
  296. if template.has(3):
  297. # Construct link out of Template, Params:
  298. # 1 = Surname
  299. # 2 = Name
  300. # 3 = Link-Dest
  301. interpret_link = mwparser.nodes.wikilink.Wikilink(
  302. str(template.get(3).value),
  303. str(template.get(1).value) + " " +
  304. str(template.get(2).value) )
  305. # Default Link-Dest [[Surname Name]]
  306. else:
  307. interpret_link = mwparser.nodes.wikilink.Wikilink(
  308. str(template.get(1).value) + " " +
  309. str(template.get(2).value) )
  310. # Replace Template with link
  311. self._interpret_raw.replace( template, interpret_link )
  312. # SortKey
  313. else:
  314. # Replace SortKey with text from param 2 if present
  315. if template.has(2):
  316. self._interpret_raw.replace( template,
  317. template.get(2).value)
  318. # Else Remove SortKey (text should follow behind SortKey)
  319. else:
  320. self._interpret_raw.replace( template, None)
  321. # Normally won't be needed as there should be only one
  322. # SortKey-Temlate but ... its a wiki
  323. break
  324. # Remove whitespace
  325. self._interpret_raw = str(self._interpret_raw).strip()
  326. else:
  327. raise CountryListEntryError( "Template Parameter 'Interpret' is \
  328. missing!" )
  329. def _search_links( self, keywords, indexes=None ):
  330. """
  331. Search matching wikilinks for keyword(s) in CountryList's wikicode
  332. @param keywords: One or more keywords to search for
  333. @type keywords: str, list
  334. @param indexes: List with numeric indexes for items of keywords to work
  335. on only
  336. @type indexes: list of ints
  337. @return: List or String with replaced keywords
  338. @return type: str, list
  339. """
  340. # Maybe convert keywords string to list
  341. if( isinstance( keywords, str ) ):
  342. keywords = [ keywords, ]
  343. string = True
  344. else:
  345. string = False
  346. # If indexes worklist was not provided, work on all elements
  347. if not indexes:
  348. indexes = list(range( len( keywords ) ))
  349. # Iterate over wikilinks of refpage and try to find related links
  350. for wikilink in self.wikicode.ifilter_wikilinks():
  351. # Iterate over interpret names
  352. for index in indexes:
  353. # Check wether wikilink matches
  354. if( keywords[index] == wikilink.text or
  355. keywords[index] == wikilink.title ):
  356. # Overwrite name with complete wikilink
  357. keywords[index] = str( wikilink )
  358. # Remove index from worklist
  359. indexes.remove( index )
  360. # Other indexes won't also match
  361. break
  362. # If worklist is empty, stop iterating over wikilinks
  363. if not indexes:
  364. break
  365. # Choose wether return list or string based on input type
  366. if not string:
  367. return keywords
  368. else:
  369. return str(keywords[0])
  370. def remove_lines(self, wikicode):
  371. """
  372. Removes linebreaks (<br>) and everything after them in given wikicode
  373. """
  374. # Catch wrong typed param
  375. if not isinstance(wikicode, mwparser.wikicode.Wikicode):
  376. raise TypeError(str(type(self)) + "._remove_lines() expects " +
  377. "parameter 'wikicode' of type " +
  378. "'mwparserfromhell.wikicode.Wikicode', " +
  379. str(type(wikicode)) + " was given!")
  380. # Find first linebreak
  381. br = next(wikicode.ifilter_tags(matches="br"), None)
  382. # If there is one, get its position and slice nodes-list
  383. if br:
  384. brpos = wikicode.nodes.index(br)
  385. wikicode.nodes = wikicode.nodes[0:brpos]
  386. def __str__( self ):
  387. """
  388. Returns str repression for Object
  389. """
  390. if self.parsed:
  391. return ("CountryList( Link = \"{link}\", Revid = \"{revid}\", " +
  392. "Interpret = \"{interpret}\", Titel = \"{titel}\", " +
  393. "Chartein = \"{chartein}\" )").format(
  394. link=repr(self.wikilink),
  395. revid=self.revid,
  396. interpret=self.interpret,
  397. titel=self.titel,
  398. chartein=repr(self.chartein))
  399. else:
  400. return "CountryList( Link = \"{link}\" )".format(
  401. link=repr(self.wikilink))
  402. class CountryListError( Exception ):
  403. """
  404. Handles errors occuring in class CountryList
  405. """
  406. pass
  407. class CountryListEntryError( CountryListError ):
  408. """
  409. Handles errors occuring in class CountryList related to entrys
  410. """
  411. pass
  412. class CountryListUnitTest():
  413. """
  414. Defines Test-Functions for CountryList-Module
  415. """
  416. testcases = ( { "Link": mwparser.nodes.Wikilink( "Benutzer:JogoBot/Charts/Tests/Liste der Nummer-eins-Hits in Frankreich (2015)" ), # noqa
  417. "revid": 148453827,
  418. "interpret": "[[Adele (Sängerin)|Adele]]",
  419. "titel": "[[Hello (Adele-Lied)|Hello]]",
  420. "chartein": datetime( 2015, 10, 23 ) },
  421. { "Link": mwparser.nodes.Wikilink( "Benutzer:JogoBot/Charts/Tests/Liste der Nummer-eins-Hits in Belgien (2015)", "Wallonien"), # noqa
  422. "revid": 148455281,
  423. "interpret": "[[Nicky Jam]] & [[Enrique Iglesias (Sänger)|Enrique Iglesias]]", # noqa
  424. "titel": "El perdón",
  425. "chartein": datetime( 2015, 9, 12 ) } )
  426. def __init__( self, page=None ):
  427. """
  428. Constructor
  429. Set attribute page
  430. """
  431. if page:
  432. self.page_link = mwparser.nodes.Wikilink( page )
  433. else:
  434. self.page_link = None
  435. def treat( self ):
  436. """
  437. Start testing either manually with page provided by cmd-arg page or
  438. automatically with predefined test case
  439. """
  440. if self.page_link:
  441. self.man_test()
  442. else:
  443. self.auto_test()
  444. def auto_test( self ):
  445. """
  446. Run automatic tests with predefined test data from wiki
  447. """
  448. for case in type(self).testcases:
  449. self.countrylist = CountryList( case["Link"] )
  450. if( self.countrylist.is_parsing_needed( case["revid"] ) or not
  451. self.countrylist.is_parsing_needed( case["revid"] + 1 ) ):
  452. raise Exception(
  453. "CountryList.is_parsing_needed() does not work!" )
  454. self.countrylist.parse()
  455. for key in case:
  456. if key == "Link":
  457. continue
  458. if not case[key] == getattr(self.countrylist, key ):
  459. raise Exception( key + " – " + str(
  460. getattr(self.countrylist, key ) ))
  461. def man_test( self ):
  462. """
  463. Run manual test with page given in parameter
  464. """
  465. self.countrylist = CountryList( self.page_link )
  466. self.countrylist.parse()
  467. print( self.countrylist )
  468. print( "Since we have no data to compare, you need to manually " +
  469. "check data above against given page to ensure correct " +
  470. "working of module!" )
  471. def main(*args):
  472. """
  473. Handling direct calls --> unittest
  474. """
  475. # Process global arguments to determine desired site
  476. local_args = pywikibot.handle_args(args)
  477. # Parse command line arguments
  478. for arg in local_args:
  479. if arg.startswith("-page:"):
  480. page = arg[ len("-page:"): ]
  481. # Call unittest-class
  482. test = CountryListUnitTest( page )
  483. test.treat()
  484. if __name__ == "__main__":
  485. main()