Localisation updates from https://translatewiki.net.
[mediawiki.git] / includes / site / MediaWikiPageNameNormalizer.php
blobaff7710cb4fe9d159899d87b9fb1915f72b6faa1
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
18 * @file
21 namespace MediaWiki\Site;
23 use InvalidArgumentException;
24 use MediaWiki\Http\HttpRequestFactory;
25 use MediaWiki\Json\FormatJson;
26 use MediaWiki\MediaWikiServices;
27 use UtfNormal\Validator;
/**
 * Service for normalizing a page name via a MediaWiki action API.
 *
 * @since 1.27
 * @author John Erling Blad < jeblad@gmail.com >
 * @author Daniel Kinzler
 * @author Jeroen De Dauw < jeroendedauw@gmail.com >
 * @author Marius Hoch
 */
class MediaWikiPageNameNormalizer {

	public const FOLLOW_REDIRECT = 1;
	public const NOFOLLOW_REDIRECT = 2;

	/**
	 * @var HttpRequestFactory
	 */
	private $httpRequestFactory;

	/**
	 * @param HttpRequestFactory|null $httpRequestFactory Factory used to perform the API
	 *  request. When anything other than an HttpRequestFactory is given, the factory
	 *  of the global MediaWikiServices instance is used instead.
	 */
	public function __construct( $httpRequestFactory = null ) {
		if ( !$httpRequestFactory instanceof HttpRequestFactory ) {
			$httpRequestFactory = MediaWikiServices::getInstance()->getHttpRequestFactory();
		}
		$this->httpRequestFactory = $httpRequestFactory;
	}

	/**
	 * Returns the normalized form of the given page title, using the
	 * normalization rules of the given site. If $followRedirect is set to
	 * self::FOLLOW_REDIRECT (default) and the given title is a redirect,
	 * the redirect will be resolved and the redirect target is returned.
	 * Only titles of existing pages will be returned.
	 *
	 * @note This actually makes an API request to the remote site, so beware
	 *   that this function is slow and depends on an external service.
	 *
	 * @see Site::normalizePageName
	 *
	 * @since 1.27
	 * @since 1.37 Added $followRedirect
	 *
	 * @param string $pageName
	 * @param string $apiUrl
	 * @param int $followRedirect either self::FOLLOW_REDIRECT or self::NOFOLLOW_REDIRECT
	 *
	 * @return string|false The normalized form of the title,
	 * or false to indicate an invalid title, a missing page,
	 * or some other kind of error.
	 * @throws InvalidArgumentException If $followRedirect is neither of the two constants.
	 */
	public function normalizePageName( string $pageName, $apiUrl, $followRedirect = self::FOLLOW_REDIRECT ) {
		if ( $followRedirect === self::FOLLOW_REDIRECT ) {
			$redirects = true;
		} elseif ( $followRedirect === self::NOFOLLOW_REDIRECT ) {
			$redirects = false;
		} else {
			// Describe non-scalar values by their type, so that building the
			// message can never fail before the intended exception is thrown.
			throw new InvalidArgumentException(
				'$followRedirect is not properly set: '
				. ( is_scalar( $followRedirect ) ? $followRedirect : gettype( $followRedirect ) )
			);
		}

		// Make sure the string is normalized into NFC (due to T42017)
		// but do nothing to the whitespaces, that should work appropriately.
		// @see https://phabricator.wikimedia.org/T42017
		$pageName = Validator::cleanUp( $pageName );

		// Build the args for the specific call
		$args = [
			'action' => 'query',
			'prop' => 'info',
			'redirects' => $redirects,
			'converttitles' => true,
			'format' => 'json',
			'titles' => $pageName,
			// @todo options for maxlag and maxage
			// Note that maxlag will lead to a long delay before a reply is made,
			// but that maxage can avoid the extreme delay. On the other hand
			// maxage could be nice to use anyhow as it stops unnecessary requests.
			// Also consider smaxage if maxage is used.
		];

		$url = wfAppendQuery( $apiUrl, $args );

		// Go on call the external site
		// @todo we need a good way to specify a timeout here.
		$ret = $this->httpRequestFactory->get( $url, [], __METHOD__ );

		// A null result is treated as a failed request.
		if ( $ret === null ) {
			wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
			return false;
		}

		$data = FormatJson::decode( $ret, true );

		if ( !is_array( $data ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
			return false;
		}

		// The helper is private, so bind it statically to this class: late
		// static binding would break if a subclass declared the same name.
		$page = self::extractPageRecord( $data, $pageName );

		// $page is false when no record could be found; isset() on it is
		// simply false, so that case falls through to the "no title" branch.
		if ( isset( $page['missing'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
				. $ret );
			return false;
		}

		if ( isset( $page['invalid'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
				. $ret );
			return false;
		}

		if ( !isset( $page['title'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
			return false;
		}

		return $page['title'];
	}

	/**
	 * Get normalization record for a given page title from an API response.
	 *
	 * If the response holds exactly one page, that page is returned directly.
	 * Otherwise the title is followed through the 'normalized', 'converted'
	 * and 'redirects' mappings (in that order) and then looked up in 'pages'.
	 *
	 * @param array $externalData A reply from the API on a external server.
	 * @param string $pageTitle Identifies the page at the external site, needing normalization.
	 *
	 * @return array|false A 'page' structure representing the page identified by $pageTitle,
	 *  or false if the reply is malformed or the page cannot be identified unambiguously.
	 */
	private static function extractPageRecord( $externalData, $pageTitle ) {
		// Validate the overall shape before indexing into it.
		if ( !is_array( $externalData )
			|| !isset( $externalData['query'] )
			|| !is_array( $externalData['query'] )
		) {
			return false;
		}
		$query = $externalData['query'];

		// If there is a special case with only one returned page
		// we can cheat, and only return
		// the single page in the "pages" substructure.
		if ( isset( $query['pages'] ) && is_array( $query['pages'] ) ) {
			$pages = array_values( $query['pages'] );
			if ( count( $pages ) === 1 ) {
				return $pages[0];
			}
		}

		// Multi-page replies. The original notes state that this path is only
		// used during internal testing.
		// Loop over the four differently named structures, that otherwise are similar.
		$structs = [
			'normalized' => 'from',
			'converted' => 'from',
			'redirects' => 'from',
			'pages' => 'title'
		];

		foreach ( $structs as $listId => $fieldId ) {
			// Check if the substructure exists at all.
			if ( !isset( $query[$listId] ) || !is_array( $query[$listId] ) ) {
				continue;
			}

			// Filter the substructure down to what we actually are using.
			// array_filter() preserves the original keys, so re-index the
			// result: the single hit must be reachable at index 0 even when
			// it was not the first entry of the list.
			$collectedHits = array_values( array_filter(
				$query[$listId],
				static function ( $entry ) use ( $fieldId, $pageTitle ) {
					return is_array( $entry )
						&& isset( $entry[$fieldId] )
						&& $entry[$fieldId] === $pageTitle;
				}
			) );
			$hitCount = count( $collectedHits );

			if ( $fieldId === 'title' ) {
				// On the pages structure: exactly one match is the answer,
				// anything else (none or ambiguous) is a failure.
				return $hitCount === 1 ? $collectedHits[0] : false;
			}

			// Still looping over normalization, conversion or redirects:
			// keep the new page title for later rounds.
			if ( $hitCount > 1 ) {
				return false;
			}
			if ( $hitCount === 1 ) {
				if ( !isset( $collectedHits[0]['to'] ) ) {
					// Malformed mapping entry without a target.
					return false;
				}
				$pageTitle = $collectedHits[0]['to'];
			}
		}

		// Only reached when the reply has no usable 'pages' structure.
		return false;
	}
}