From 80c94ccf4f001f91bdef4ee4b9efed954bd2be85 Mon Sep 17 00:00:00 2001 From: Jonathan Golder Date: Sat, 11 Mar 2017 11:30:19 +0100 Subject: [PATCH 1/2] Replace underscores in article titles Remove underscores in article titles and replace with spaces to have canonical state for all articles. Therefore we need to split title and possible anchors in heading parser. Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=114 FS#114] --- lib/redfam.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/redfam.py b/lib/redfam.py index dc1535d..6abb7ae 100644 --- a/lib/redfam.py +++ b/lib/redfam.py @@ -280,8 +280,21 @@ class RedFamParser( RedFam ): # (Task FS#77) heading = mwparser.parse( str( heading ) ) - # Save destinations of wikilinks in headings - return [ str( link.title ) for link in heading.ifilter_wikilinks() ] + articlesList = [] + for link in heading.ifilter_wikilinks(): + article = str( link.title ) + + # Split in title and anchor part + article = article.split("#", 1) + # Replace underscores in title with spaces + article[0] = article[0].replace("_", " ") + # Rejoin title and anchor + article = "#".join(article) + + # Add to list + articlesList.append(article) + + return articlesList def add_beginning( self, beginning ): """ From 0f930082b4e4a77731557691d7ee4d727a781750 Mon Sep 17 00:00:00 2001 From: Jonathan Golder Date: Sat, 11 Mar 2017 11:40:41 +0100 Subject: [PATCH 2/2] Also canonicalise anchor parts of articles Replace spaces in anchors with underscores as spaces are not correct there Related Task: [https://fs.golderweb.de/index.php?do=details&task_id=114 FS#114] --- lib/redfam.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/redfam.py b/lib/redfam.py index 6abb7ae..6c045c6 100644 --- a/lib/redfam.py +++ b/lib/redfam.py @@ -288,6 +288,11 @@ class RedFamParser( RedFam ): article = article.split("#", 1) # Replace underscores in title with spaces article[0] = article[0].replace("_", " ") + + if 
len(article) > 1: + # other way round, replace spaces with underscores in anchors + article[1] = article[1].replace(" ", "_") + # Rejoin title and anchor article = "#".join(article)