includes/site/MediaWikiPageNameNormalizer.php

   1 <?php
   2
   3 namespace MediaWiki\Site;
   4
   5 use FormatJson;
   6 use Http;
   7 use UtfNormal\Validator;
   8
   9 /**
  10  * Service for normalizing a page name using a MediaWiki api.
  11  *
  12  * This program is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or
  15  * (at your option) any later version.
  16  *
  17  * This program is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20  * GNU General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU General Public License along
  23  * with this program; if not, write to the Free Software Foundation, Inc.,
  24  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  25  * http://www.gnu.org/copyleft/gpl.html
  26  *
  27  * @since 1.27
  28  *
  29  * @license GNU GPL v2+
  30  * @author John Erling Blad < jeblad@gmail.com >
  31  * @author Daniel Kinzler
  32  * @author Jeroen De Dauw < jeroendedauw@gmail.com >
  33  * @author Marius Hoch
  34  */
  35 class MediaWikiPageNameNormalizer {
  36
  37         /**
  38          * Returns the normalized form of the given page title, using the
  39          * normalization rules of the given site. If the given title is a redirect,
  40          * the redirect weill be resolved and the redirect target is returned.
  41          *
  42          * @note This actually makes an API request to the remote site, so beware
  43          *   that this function is slow and depends on an external service.
  44          *
  45          * @see Site::normalizePageName
  46          *
  47          * @since 1.27
  48          *
  49          * @param string $pageName
  50          * @param string $apiUrl
  51          *
  52          * @return string
  53          * @throws \MWException
  54          */
  55         public function normalizePageName( $pageName, $apiUrl ) {
  56
  57                 // Check if we have strings as arguments.
  58                 if ( !is_string( $pageName ) ) {
  59                         throw new \MWException( '$pageName must be a string' );
  60                 }
  61
  62                 // Go on call the external site
  63
  64                 // Make sure the string is normalized into NFC (due to T42017)
  65                 // but do nothing to the whitespaces, that should work appropriately.
  66                 // @see https://phabricator.wikimedia.org/T42017
  67                 $pageName = Validator::cleanUp( $pageName );
  68
  69                 // Build the args for the specific call
  70                 $args = array(
  71                         'action' => 'query',
  72                         'prop' => 'info',
  73                         'redirects' => true,
  74                         'converttitles' => true,
  75                         'format' => 'json',
  76                         'titles' => $pageName,
  77                         // @todo options for maxlag and maxage
  78                         // Note that maxlag will lead to a long delay before a reply is made,
  79                         // but that maxage can avoid the extreme delay. On the other hand
  80                         // maxage could be nice to use anyhow as it stops unnecessary requests.
  81                         // Also consider smaxage if maxage is used.
  82                 );
  83
  84                 $url = wfAppendQuery( $apiUrl, $args );
  85
  86                 // Go on call the external site
  87                 // @todo we need a good way to specify a timeout here.
  88                 $ret = Http::get( $url, array(), __METHOD__ );
  89
  90                 if ( $ret === false ) {
  91                         wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
  92                         return false;
  93                 }
  94
  95                 $data = FormatJson::decode( $ret, true );
  96
  97                 if ( !is_array( $data ) ) {
  98                         wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
  99                         return false;
 100                 }
 101
 102                 $page = static::extractPageRecord( $data, $pageName );
 103
 104                 if ( isset( $page['missing'] ) ) {
 105                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
 106                                 . $ret );
 107                         return false;
 108                 }
 109
 110                 if ( isset( $page['invalid'] ) ) {
 111                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
 112                                 . $ret );
 113                         return false;
 114                 }
 115
 116                 if ( !isset( $page['title'] ) ) {
 117                         wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
 118                         return false;
 119                 }
 120
 121                 return $page['title'];
 122         }
 123
 124         /**
 125          * Get normalization record for a given page title from an API response.
 126          *
 127          * @param array $externalData A reply from the API on a external server.
 128          * @param string $pageTitle Identifies the page at the external site, needing normalization.
 129          *
 130          * @return array|bool A 'page' structure representing the page identified by $pageTitle.
 131          */
 132         private static function extractPageRecord( $externalData, $pageTitle ) {
 133                 // If there is a special case with only one returned page
 134                 // we can cheat, and only return
 135                 // the single page in the "pages" substructure.
 136                 if ( isset( $externalData['query']['pages'] ) ) {
 137                         $pages = array_values( $externalData['query']['pages'] );
 138                         if ( count( $pages ) === 1 ) {
 139                                 return $pages[0];
 140                         }
 141                 }
 142                 // This is only used during internal testing, as it is assumed
 143                 // a more optimal (and lossfree) storage.
 144                 // Make initial checks and return if prerequisites are not meet.
 145                 if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
 146                         return false;
 147                 }
 148                 // Loop over the tree different named structures, that otherwise are similar
 149                 $structs = array(
 150                         'normalized' => 'from',
 151                         'converted' => 'from',
 152                         'redirects' => 'from',
 153                         'pages' => 'title'
 154                 );
 155                 foreach ( $structs as $listId => $fieldId ) {
 156                         // Check if the substructure exist at all.
 157                         if ( !isset( $externalData['query'][$listId] ) ) {
 158                                 continue;
 159                         }
 160                         // Filter the substructure down to what we actually are using.
 161                         $collectedHits = array_filter(
 162                                 array_values( $externalData['query'][$listId] ),
 163                                 function ( $a ) use ( $fieldId, $pageTitle ) {
 164                                         return $a[$fieldId] === $pageTitle;
 165                                 }
 166                         );
 167                         // If still looping over normalization, conversion or redirects,
 168                         // then we need to keep the new page title for later rounds.
 169                         if ( $fieldId === 'from' && is_array( $collectedHits ) ) {
 170                                 switch ( count( $collectedHits ) ) {
 171                                         case 0:
 172                                                 break;
 173                                         case 1:
 174                                                 $pageTitle = $collectedHits[0]['to'];
 175                                                 break;
 176                                         default:
 177                                                 return false;
 178                                 }
 179                         } elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) {
 180                                 // If on the pages structure we should prepare for returning.
 181
 182                                 switch ( count( $collectedHits ) ) {
 183                                         case 0:
 184                                                 return false;
 185                                         case 1:
 186                                                 return array_shift( $collectedHits );
 187                                         default:
 188                                                 return false;
 189                                 }
 190                         }
 191                 }
 192                 // should never be here
 193                 return false;
 194         }
 195
 196 }