includes/site/MediaWikiPageNameNormalizer.php

   1 <?php
   2 /**
   3  * This program is free software; you can redistribute it and/or modify
   4  * it under the terms of the GNU General Public License as published by
   5  * the Free Software Foundation; either version 2 of the License, or
   6  * (at your option) any later version.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11  * GNU General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public License along
  14  * with this program; if not, write to the Free Software Foundation, Inc.,
  15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  16  * http://www.gnu.org/copyleft/gpl.html
  17  *
  18  * @file
  19  */
  20
  21 namespace MediaWiki\Site;
  22
  23 use InvalidArgumentException;
  24 use MediaWiki\Http\HttpRequestFactory;
  25 use MediaWiki\Json\FormatJson;
  26 use MediaWiki\MediaWikiServices;
  27 use UtfNormal\Validator;
  28
  29 /**
  30  * Service for normalizing a page name via a MediaWiki action API.
  31  *
  32  * @since 1.27
  33  * @author John Erling Blad < jeblad@gmail.com >
  34  * @author Daniel Kinzler
  35  * @author Jeroen De Dauw < jeroendedauw@gmail.com >
  36  * @author Marius Hoch
  37  */
  38 class MediaWikiPageNameNormalizer {
  39
  40         public const FOLLOW_REDIRECT = 1;
  41         public const NOFOLLOW_REDIRECT = 2;
  42
  43         /**
  44          * @var HttpRequestFactory
  45          */
  46         private $httpRequestFactory;
  47
  48         /**
  49          * @param HttpRequestFactory|null $httpRequestFactory
  50          */
  51         public function __construct( $httpRequestFactory = null ) {
  52                 if ( !$httpRequestFactory instanceof HttpRequestFactory ) {
  53                         $httpRequestFactory = MediaWikiServices::getInstance()->getHttpRequestFactory();
  54                 }
  55                 $this->httpRequestFactory = $httpRequestFactory;
  56         }
  57
  58         /**
  59          * Returns the normalized form of the given page title, using the
  60          * normalization rules of the given site. If $followRedirect is set to self::FOLLOW_REDIRECT (default)
  61          * and the given title is a redirect, the redirect will be resolved and
  62          * the redirect target is returned.
  63          * Only titles of existing pages will be returned.
  64          *
  65          * @note This actually makes an API request to the remote site, so beware
  66          *   that this function is slow and depends on an external service.
  67          *
  68          * @see Site::normalizePageName
  69          *
  70          * @since 1.27
  71          * @since 1.37 Added $followRedirect
  72          *
  73          * @param string $pageName
  74          * @param string $apiUrl
  75          * @param int $followRedirect either self::FOLLOW_REDIRECT or self::NOFOLLOW_REDIRECT
  76          *
  77          * @return string|false The normalized form of the title,
  78          * or false to indicate an invalid title, a missing page,
  79          * or some other kind of error.
  80          */
  81         public function normalizePageName( string $pageName, $apiUrl, $followRedirect = self::FOLLOW_REDIRECT ) {
  82                 if ( $followRedirect === self::FOLLOW_REDIRECT ) {
  83                         $redirects = true;
  84                 } elseif ( $followRedirect === self::NOFOLLOW_REDIRECT ) {
  85                         $redirects = false;
  86                 } else {
  87                         throw new InvalidArgumentException( '$followRedirect is not properly set: ' . $followRedirect );
  88                 }
  89
  90                 // Go on call the external site
  91
  92                 // Make sure the string is normalized into NFC (due to T42017)
  93                 // but do nothing to the whitespaces, that should work appropriately.
  94                 // @see https://phabricator.wikimedia.org/T42017
  95                 $pageName = Validator::cleanUp( $pageName );
  96
  97                 // Build the args for the specific call
  98                 $args = [
  99                         'action' => 'query',
 100                         'prop' => 'info',
 101                         'redirects' => $redirects,
 102                         'converttitles' => true,
 103                         'format' => 'json',
 104                         'titles' => $pageName,
 105                         // @todo options for maxlag and maxage
 106                         // Note that maxlag will lead to a long delay before a reply is made,
 107                         // but that maxage can avoid the extreme delay. On the other hand
 108                         // maxage could be nice to use anyhow as it stops unnecessary requests.
 109                         // Also consider smaxage if maxage is used.
 110                 ];
 111
 112                 $url = wfAppendQuery( $apiUrl, $args );
 113
 114                 // Go on call the external site
 115                 // @todo we need a good way to specify a timeout here.
 116                 $ret = $this->httpRequestFactory->get( $url, [], __METHOD__ );
 117
 118                 if ( $ret === null ) {
 119                         wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
 120                         return false;
 121                 }
 122
 123                 $data = FormatJson::decode( $ret, true );
 124
 125                 if ( !is_array( $data ) ) {
 126                         wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
 127                         return false;
 128                 }
 129
 130                 $page = static::extractPageRecord( $data, $pageName );
 131
 132                 if ( isset( $page['missing'] ) ) {
 133                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
 134                                 . $ret );
 135                         return false;
 136                 }
 137
 138                 if ( isset( $page['invalid'] ) ) {
 139                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
 140                                 . $ret );
 141                         return false;
 142                 }
 143
 144                 if ( !isset( $page['title'] ) ) {
 145                         wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
 146                         return false;
 147                 }
 148
 149                 return $page['title'];
 150         }
 151
 152         /**
 153          * Get normalization record for a given page title from an API response.
 154          *
 155          * @param array $externalData A reply from the API on a external server.
 156          * @param string $pageTitle Identifies the page at the external site, needing normalization.
 157          *
 158          * @return array|bool A 'page' structure representing the page identified by $pageTitle.
 159          */
 160         private static function extractPageRecord( $externalData, $pageTitle ) {
 161                 // If there is a special case with only one returned page
 162                 // we can cheat, and only return
 163                 // the single page in the "pages" substructure.
 164                 if ( isset( $externalData['query']['pages'] ) ) {
 165                         $pages = array_values( $externalData['query']['pages'] );
 166                         if ( count( $pages ) === 1 ) {
 167                                 return $pages[0];
 168                         }
 169                 }
 170                 // This is only used during internal testing, as it is assumed
 171                 // a more optimal (and lossfree) storage.
 172                 // Make initial checks and return if prerequisites are not meet.
 173                 if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
 174                         return false;
 175                 }
 176                 // Loop over the tree different named structures, that otherwise are similar
 177                 $structs = [
 178                         'normalized' => 'from',
 179                         'converted' => 'from',
 180                         'redirects' => 'from',
 181                         'pages' => 'title'
 182                 ];
 183                 foreach ( $structs as $listId => $fieldId ) {
 184                         // Check if the substructure exist at all.
 185                         if ( !isset( $externalData['query'][$listId] ) ) {
 186                                 continue;
 187                         }
 188                         // Filter the substructure down to what we actually are using.
 189                         $collectedHits = array_filter(
 190                                 array_values( $externalData['query'][$listId] ),
 191                                 static function ( $a ) use ( $fieldId, $pageTitle ) {
 192                                         return $a[$fieldId] === $pageTitle;
 193                                 }
 194                         );
 195                         // If still looping over normalization, conversion or redirects,
 196                         // then we need to keep the new page title for later rounds.
 197                         if ( $fieldId === 'from' && is_array( $collectedHits ) ) {
 198                                 switch ( count( $collectedHits ) ) {
 199                                         case 0:
 200                                                 break;
 201                                         case 1:
 202                                                 $pageTitle = $collectedHits[0]['to'];
 203                                                 break;
 204                                         default:
 205                                                 return false;
 206                                 }
 207                         } elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) {
 208                                 // If on the pages structure we should prepare for returning.
 209
 210                                 switch ( count( $collectedHits ) ) {
 211                                         case 0:
 212                                                 return false;
 213                                         case 1:
 214                                                 return array_shift( $collectedHits );
 215                                         default:
 216                                                 return false;
 217                                 }
 218                         }
 219                 }
 220                 // should never be here
 221                 return false;
 222         }
 223
 224 }