includes/site/MediaWikiSite.php

   1 <?php
   2 /**
   3  * Class representing a MediaWiki site.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  * @ingroup Site
  22  * @license GNU GPL v2+
  23  * @author John Erling Blad < jeblad@gmail.com >
  24  * @author Daniel Kinzler
  25  * @author Jeroen De Dauw < jeroendedauw@gmail.com >
  26  */
  27
  28 /**
  29  * Class representing a MediaWiki site.
  30  *
  31  * @since 1.21
  32  *
  33  * @ingroup Site
  34  */
  35 class MediaWikiSite extends Site {
  36
  37         const PATH_FILE = 'file_path';
  38         const PATH_PAGE = 'page_path';
  39
  40         /**
  41          * @since 1.21
  42          * @deprecated Just use the constructor or the factory Site::newForType
  43          *
  44          * @param integer $globalId
  45          *
  46          * @return MediaWikiSite
  47          */
  48         public static function newFromGlobalId( $globalId ) {
  49                 $site = new static();
  50                 $site->setGlobalId( $globalId );
  51                 return $site;
  52         }
  53
  54         /**
  55          * Constructor.
  56          *
  57          * @since 1.21
  58          *
  59          * @param string $type
  60          */
  61         public function __construct( $type = self::TYPE_MEDIAWIKI ) {
  62                 parent::__construct( $type );
  63         }
  64
  65         /**
  66          * Returns the database form of the given title.
  67          *
  68          * @since 1.21
  69          *
  70          * @param string $title the target page's title, in normalized form.
  71          *
  72          * @return String
  73          */
  74         public function toDBKey( $title ) {
  75                 return str_replace( ' ', '_', $title );
  76         }
  77
  78         /**
  79          * Returns the normalized form of the given page title, using the normalization rules of the given site.
  80          * If the given title is a redirect, the redirect weill be resolved and the redirect target is returned.
  81          *
  82          * @note  : This actually makes an API request to the remote site, so beware that this function is slow and depends
  83          *          on an external service.
  84          *
  85          * @note  : If MW_PHPUNIT_TEST is defined, the call to the external site is skipped, and the title
  86          *          is normalized using the local normalization rules as implemented by the Title class.
  87          *
  88          * @see Site::normalizePageName
  89          *
  90          * @since 1.21
  91          *
  92          * @param string $pageName
  93          *
  94          * @return string
  95          * @throws MWException
  96          */
  97         public function normalizePageName( $pageName ) {
  98
  99                 // Check if we have strings as arguments.
 100                 if ( !is_string( $pageName ) ) {
 101                         throw new MWException( '$pageName must be a string' );
 102                 }
 103
 104                 // Go on call the external site
 105                 if ( defined( 'MW_PHPUNIT_TEST' ) ) {
 106                         // If the code is under test, don't call out to other sites, just normalize locally.
 107                         // Note: this may cause results to be inconsistent with the actual normalization used by the respective remote site!
 108
 109                         $t = Title::newFromText( $pageName );
 110                         return $t->getPrefixedText();
 111                 } else {
 112
 113                         // Make sure the string is normalized into NFC (due to the bug 40017)
 114                         // but do nothing to the whitespaces, that should work appropriately.
 115                         // @see https://bugzilla.wikimedia.org/show_bug.cgi?id=40017
 116                         $pageName = UtfNormal::cleanUp( $pageName );
 117
 118                         // Build the args for the specific call
 119                         $args = array(
 120                                 'action' => 'query',
 121                                 'prop' => 'info',
 122                                 'redirects' => true,
 123                                 'converttitles' => true,
 124                                 'format' => 'json',
 125                                 'titles' => $pageName,
 126                                 // @todo options for maxlag and maxage
 127                                 // Note that maxlag will lead to a long delay before a reply is made,
 128                                 // but that maxage can avoid the extreme delay. On the other hand
 129                                 // maxage could be nice to use anyhow as it stops unnecessary requests.
 130                                 // Also consider smaxage if maxage is used.
 131                         );
 132
 133                         $url = wfAppendQuery( $this->getFileUrl( 'api.php' ), $args );
 134
 135                         // Go on call the external site
 136                         // @todo we need a good way to specify a timeout here.
 137                         $ret = Http::get( $url );
 138                 }
 139
 140                 if ( $ret === false ) {
 141                         wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
 142                         return false;
 143                 }
 144
 145                 $data = FormatJson::decode( $ret, true );
 146
 147                 if ( !is_array( $data ) ) {
 148                         wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
 149                         return false;
 150                 }
 151
 152                 $page = static::extractPageRecord( $data, $pageName );
 153
 154                 if ( isset( $page['missing'] ) ) {
 155                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! " . $ret );
 156                         return false;
 157                 }
 158
 159                 if ( isset( $page['invalid'] ) ) {
 160                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! " . $ret );
 161                         return false;
 162                 }
 163
 164                 if ( !isset( $page['title'] ) ) {
 165                         wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
 166                         return false;
 167                 }
 168
 169                 return $page['title'];
 170         }
 171
 172         /**
 173          * Get normalization record for a given page title from an API response.
 174          *
 175          * @since 1.21
 176          *
 177          * @param array $externalData A reply from the API on a external server.
 178          * @param string $pageTitle Identifies the page at the external site, needing normalization.
 179          *
 180          * @return array|boolean a 'page' structure representing the page identified by $pageTitle.
 181          */
 182         private static function extractPageRecord( $externalData, $pageTitle ) {
 183                 // If there is a special case with only one returned page
 184                 // we can cheat, and only return
 185                 // the single page in the "pages" substructure.
 186                 if ( isset( $externalData['query']['pages'] ) ) {
 187                         $pages = array_values( $externalData['query']['pages'] );
 188                         if ( count( $pages ) === 1 ) {
 189                                 return $pages[0];
 190                         }
 191                 }
 192                 // This is only used during internal testing, as it is assumed
 193                 // a more optimal (and lossfree) storage.
 194                 // Make initial checks and return if prerequisites are not meet.
 195                 if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
 196                         return false;
 197                 }
 198                 // Loop over the tree different named structures, that otherwise are similar
 199                 $structs = array(
 200                         'normalized' => 'from',
 201                         'converted' => 'from',
 202                         'redirects' => 'from',
 203                         'pages' => 'title'
 204                 );
 205                 foreach ( $structs as $listId => $fieldId ) {
 206                         // Check if the substructure exist at all.
 207                         if ( !isset( $externalData['query'][$listId] ) ) {
 208                                 continue;
 209                         }
 210                         // Filter the substructure down to what we actually are using.
 211                         $collectedHits = array_filter(
 212                                 array_values( $externalData['query'][$listId] ),
 213                                 function( $a ) use ( $fieldId, $pageTitle ) {
 214                                         return $a[$fieldId] === $pageTitle;
 215                                 }
 216                         );
 217                         // If still looping over normalization, conversion or redirects,
 218                         // then we need to keep the new page title for later rounds.
 219                         if ( $fieldId === 'from' && is_array( $collectedHits ) ) {
 220                                 switch ( count( $collectedHits ) ) {
 221                                         case 0:
 222                                                 break;
 223                                         case 1:
 224                                                 $pageTitle = $collectedHits[0]['to'];
 225                                                 break;
 226                                         default:
 227                                                 return false;
 228                                 }
 229                         }
 230                         // If on the pages structure we should prepare for returning.
 231                         elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) {
 232                                 switch ( count( $collectedHits ) ) {
 233                                         case 0:
 234                                                 return false;
 235                                         case 1:
 236                                                 return array_shift( $collectedHits );
 237                                         default:
 238                                                 return false;
 239                                 }
 240                         }
 241                 }
 242                 // should never be here
 243                 return false;
 244         }
 245
 246         /**
 247          * @see Site::getLinkPathType
 248          * Returns Site::PATH_PAGE
 249          *
 250          * @since 1.21
 251          *
 252          * @return string
 253          */
 254         public function getLinkPathType() {
 255                 return self::PATH_PAGE;
 256         }
 257
 258         /**
 259          * Returns the relative page path.
 260          *
 261          * @since 1.21
 262          *
 263          * @return string
 264          */
 265         public function getRelativePagePath() {
 266                 return parse_url( $this->getPath( self::PATH_PAGE ), PHP_URL_PATH );
 267         }
 268
 269         /**
 270          * Returns the relative file path.
 271          *
 272          * @since 1.21
 273          *
 274          * @return string
 275          */
 276         public function getRelativeFilePath() {
 277                 return parse_url( $this->getPath( self::PATH_FILE ), PHP_URL_PATH );
 278         }
 279
 280         /**
 281          * Sets the relative page path.
 282          *
 283          * @since 1.21
 284          *
 285          * @param string $path
 286          */
 287         public function setPagePath( $path ) {
 288                 $this->setPath( self::PATH_PAGE, $path );
 289         }
 290
 291         /**
 292          * Sets the relative file path.
 293          *
 294          * @since 1.21
 295          *
 296          * @param string $path
 297          */
 298         public function setFilePath( $path ) {
 299                 $this->setPath( self::PATH_FILE, $path );
 300         }
 301
 302         /**
 303          * @see Site::getPageUrl
 304          *
 305          * This implementation returns a URL constructed using the path returned by getLinkPath().
 306          * In addition to the default behavior implemented by Site::getPageUrl(), this
 307          * method converts the $pageName to DBKey-format by replacing spaces with underscores
 308          * before using it in the URL.
 309          *
 310          * @since 1.21
 311          *
 312          * @param string|boolean $pageName Page name or false (default: false)
 313          *
 314          * @return string
 315          */
 316         public function getPageUrl( $pageName = false ) {
 317                 $url = $this->getLinkPath();
 318
 319                 if ( $url === false ) {
 320                         return false;
 321                 }
 322
 323                 if ( $pageName !== false ) {
 324                         $pageName = $this->toDBKey( trim( $pageName ) );
 325                         $url = str_replace( '$1', wfUrlencode( $pageName ), $url );
 326                 }
 327
 328                 return $url;
 329         }
 330
 331         /**
 332          * Returns the full file path (ie site url + relative file path).
 333          * The path should go at the $1 marker. If the $path
 334          * argument is provided, the marker will be replaced by it's value.
 335          *
 336          * @since 1.21
 337          *
 338          * @param string|boolean $path
 339          *
 340          * @return string
 341          */
 342         public function getFileUrl( $path = false ) {
 343                 $filePath = $this->getPath( self::PATH_FILE );
 344
 345                 if ( $filePath !== false ) {
 346                         $filePath = str_replace( '$1', $path, $filePath );
 347                 }
 348
 349                 return $filePath;
 350         }
 351
 352 }