includes/site/MediaWikiSite.php

   1 <?php
   2 /**
   3  * Class representing a MediaWiki site.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  * @ingroup Site
  22  * @license GNU GPL v2+
  23  * @author John Erling Blad < jeblad@gmail.com >
  24  * @author Daniel Kinzler
  25  * @author Jeroen De Dauw < jeroendedauw@gmail.com >
  26  */
  27
  28 /**
  29  * Class representing a MediaWiki site.
  30  *
  31  * @since 1.21
  32  *
  33  * @ingroup Site
  34  */
  35 class MediaWikiSite extends SiteObject {
  36
  37         const PATH_FILE = 'file_path';
  38         const PATH_PAGE = 'page_path';
  39
  40         /**
  41          * @since 1.21
  42          *
  43          * @param integer $globalId
  44          *
  45          * @return MediaWikiSite
  46          */
  47         public static function newFromGlobalId( $globalId ) {
  48                 return SitesTable::singleton()->newRow( array(
  49                         'type' => Site::TYPE_MEDIAWIKI,
  50                         'global_key' => $globalId,
  51                 ), true );
  52         }
  53
  54         /**
  55          * Returns the database form of the given title.
  56          *
  57          * @since 1.21
  58          *
  59          * @param String $title the target page's title, in normalized form.
  60          *
  61          * @return String
  62          */
  63         public function toDBKey( $title ) {
  64                 return str_replace( ' ', '_', $title );
  65         }
  66
  67         /**
  68          * Returns the normalized form of the given page title, using the normalization rules of the given site.
  69          * If the given title is a redirect, the redirect weill be resolved and the redirect target is returned.
  70          *
  71          * @note  : This actually makes an API request to the remote site, so beware that this function is slow and depends
  72          *          on an external service.
  73          *
  74          * @note  : If MW_PHPUNIT_TEST is defined, the call to the external site is skipped, and the title
  75          *          is normalized using the local normalization rules as implemented by the Title class.
  76          *
  77          * @see Site::normalizePageName
  78          *
  79          * @since 1.21
  80          *
  81          * @param string $pageName
  82          *
  83          * @return string
  84          * @throws MWException
  85          */
  86         public function normalizePageName( $pageName ) {
  87
  88                 // Check if we have strings as arguments.
  89                 if ( !is_string( $pageName ) ) {
  90                         throw new MWException( '$pageName must be a string' );
  91                 }
  92
  93                 // Go on call the external site
  94                 if ( defined( 'MW_PHPUNIT_TEST' ) ) {
  95                         // If the code is under test, don't call out to other sites, just normalize locally.
  96                         // Note: this may cause results to be inconsistent with the actual normalization used by the respective remote site!
  97
  98                         $t = Title::newFromText( $pageName );
  99                         return $t->getPrefixedText();
 100                 } else {
 101
 102                         // Make sure the string is normalized into NFC (due to the bug 40017)
 103                         // but do nothing to the whitespaces, that should work appropriately.
 104                         // @see https://bugzilla.wikimedia.org/show_bug.cgi?id=40017
 105                         $pageName = UtfNormal::cleanUp( $pageName );
 106
 107                         // Build the args for the specific call
 108                         $args = array(
 109                                 'action' => 'query',
 110                                 'prop' => 'info',
 111                                 'redirects' => true,
 112                                 'converttitles' => true,
 113                                 'format' => 'json',
 114                                 'titles' => $pageName,
 115                                 //@todo: options for maxlag and maxage
 116                                 // Note that maxlag will lead to a long delay before a reply is made,
 117                                 // but that maxage can avoid the extreme delay. On the other hand
 118                                 // maxage could be nice to use anyhow as it stops unnecessary requests.
 119                                 // Also consider smaxage if maxage is used.
 120                         );
 121
 122                         $url = $this->getFileUrl( 'api.php' ) . '?' . wfArrayToCgi( $args );
 123
 124                         // Go on call the external site
 125                         //@todo: we need a good way to specify a timeout here.
 126                         $ret = Http::get( $url );
 127                 }
 128
 129                 if ( $ret === false ) {
 130                         wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
 131                         return false;
 132                 }
 133
 134                 $data = FormatJson::decode( $ret, true );
 135
 136                 if ( !is_array( $data ) ) {
 137                         wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
 138                         return false;
 139                 }
 140
 141                 $page = static::extractPageRecord( $data, $pageName );
 142
 143                 if ( isset( $page['missing'] ) ) {
 144                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! " . $ret );
 145                         return false;
 146                 }
 147
 148                 if ( isset( $page['invalid'] ) ) {
 149                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! " . $ret );
 150                         return false;
 151                 }
 152
 153                 if ( !isset( $page['title'] ) ) {
 154                         wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
 155                         return false;
 156                 }
 157
 158                 return $page['title'];
 159         }
 160
 161
 162         /**
 163          * Get normalization record for a given page title from an API response.
 164          *
 165          * @since 1.21
 166          *
 167          * @param array $externalData A reply from the API on a external server.
 168          * @param string $pageTitle Identifies the page at the external site, needing normalization.
 169          *
 170          * @return array|false a 'page' structure representing the page identified by $pageTitle.
 171          */
 172         private static function extractPageRecord( $externalData, $pageTitle ) {
 173                 // If there is a special case with only one returned page
 174                 // we can cheat, and only return
 175                 // the single page in the "pages" substructure.
 176                 if ( isset( $externalData['query']['pages'] ) ) {
 177                         $pages = array_values( $externalData['query']['pages'] );
 178                         if ( count( $pages) === 1 ) {
 179                                 return $pages[0];
 180                         }
 181                 }
 182                 // This is only used during internal testing, as it is assumed
 183                 // a more optimal (and lossfree) storage.
 184                 // Make initial checks and return if prerequisites are not meet.
 185                 if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
 186                         return false;
 187                 }
 188                 // Loop over the tree different named structures, that otherwise are similar
 189                 $structs = array(
 190                         'normalized' => 'from',
 191                         'converted' => 'from',
 192                         'redirects' => 'from',
 193                         'pages' => 'title'
 194                 );
 195                 foreach ( $structs as $listId => $fieldId ) {
 196                         // Check if the substructure exist at all.
 197                         if ( !isset( $externalData['query'][$listId] ) ) {
 198                                 continue;
 199                         }
 200                         // Filter the substructure down to what we actually are using.
 201                         $collectedHits = array_filter(
 202                                 array_values( $externalData['query'][$listId] ),
 203                                 function( $a ) use ( $fieldId, $pageTitle ) {
 204                                         return $a[$fieldId] === $pageTitle;
 205                                 }
 206                         );
 207                         // If still looping over normalization, conversion or redirects,
 208                         // then we need to keep the new page title for later rounds.
 209                         if ( $fieldId === 'from' && is_array( $collectedHits ) ) {
 210                                 switch ( count( $collectedHits ) ) {
 211                                         case 0:
 212                                                 break;
 213                                         case 1:
 214                                                 $pageTitle = $collectedHits[0]['to'];
 215                                                 break;
 216                                         default:
 217                                                 return false;
 218                                 }
 219                         }
 220                         // If on the pages structure we should prepare for returning.
 221                         elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) {
 222                                 switch ( count( $collectedHits ) ) {
 223                                         case 0:
 224                                                 return false;
 225                                         case 1:
 226                                                 return array_shift( $collectedHits );
 227                                         default:
 228                                                 return false;
 229                                 }
 230                         }
 231                 }
 232                 // should never be here
 233                 return false;
 234         }
 235
 236         /**
 237          * @see Site::getLinkPathType
 238          * Returns Site::PATH_PAGE
 239          *
 240          * @since 1.21
 241          *
 242          * @return string
 243          */
 244         public function getLinkPathType() {
 245                 return self::PATH_PAGE;
 246         }
 247
 248         /**
 249          * Returns the relative page path.
 250          *
 251          * @since 1.21
 252          *
 253          * @return string
 254          */
 255         public function getRelativePagePath() {
 256                 return parse_url( $this->getPath( self::PATH_PAGE ), PHP_URL_PATH );
 257         }
 258
 259         /**
 260          * Returns the relative file path.
 261          *
 262          * @since 1.21
 263          *
 264          * @return string
 265          */
 266         public function getRelativeFilePath() {
 267                 return parse_url( $this->getPath( self::PATH_FILE ), PHP_URL_PATH );
 268         }
 269
 270         /**
 271          * Sets the relative page path.
 272          *
 273          * @since 1.21
 274          *
 275          * @param string $path
 276          */
 277         public function setPagePath( $path ) {
 278                 $this->setPath( self::PATH_PAGE, $path );
 279         }
 280
 281         /**
 282          * Sets the relative file path.
 283          *
 284          * @since 1.21
 285          *
 286          * @param string $path
 287          */
 288         public function setFilePath( $path ) {
 289                 $this->setPath( self::PATH_FILE, $path );
 290         }
 291
 292         /**
 293          * @see Site::getPagePath
 294          *
 295          * This implementation returns a URL constructed using the path returned by getLinkPath().
 296          * In addition to the default behaviour implemented by SiteObject::getPageUrl(), this
 297          * method converts the $pageName to DBKey-format by replacing spaces with underscores
 298          * before using it in the URL.
 299          *
 300          * @since 1.21
 301          *
 302          * @param $pagename string: Page name (default: false)
 303          *
 304          * @return string
 305          */
 306         public function getPageUrl( $pageName = false ) {
 307                 $url = $this->getLinkPath();
 308
 309                 if ( $url === false ) {
 310                         return false;
 311                 }
 312
 313                 if ( $pageName !== false ) {
 314                         $pageName = $this->toDBKey( trim( $pageName ) );
 315                         $url = str_replace( '$1', wfUrlencode( $pageName ), $url ) ;
 316                 }
 317
 318                 return $url;
 319         }
 320
 321         /**
 322          * Returns the full file path (ie site url + relative file path).
 323          * The path should go at the $1 marker. If the $path
 324          * argument is provided, the marker will be replaced by it's value.
 325          *
 326          * @since 1.21
 327          *
 328          * @param string|false $path
 329          *
 330          * @return string
 331          */
 332         public function getFileUrl( $path = false ) {
 333                 $filePath = $this->getPath( self::PATH_FILE );
 334
 335                 if ( $filePath !== false ) {
 336                         $filePath = str_replace( '$1', $path, $filePath );
 337                 }
 338
 339                 return $filePath;
 340         }
 341
 342 }