<?php
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

// Namespace of this file, followed by the classes it references.
namespace MediaWiki\Site;

use InvalidArgumentException;
use MediaWiki\Http\HttpRequestFactory;
use MediaWiki\Json\FormatJson;
use MediaWiki\MediaWikiServices;
use UtfNormal\Validator;

/**
 * Service for normalizing a page name via a MediaWiki action API.
 *
 * @author John Erling Blad < jeblad@gmail.com >
 * @author Daniel Kinzler
 * @author Jeroen De Dauw < jeroendedauw@gmail.com >
 */
class MediaWikiPageNameNormalizer {

	/** Resolve redirects and report the title of the redirect target. */
	public const FOLLOW_REDIRECT = 1;

	/** Do not resolve redirects. */
	public const NOFOLLOW_REDIRECT = 2;

	/**
	 * @var HttpRequestFactory Used to perform the request against the remote API.
	 */
	private $httpRequestFactory;

	/**
	 * @param HttpRequestFactory|null $httpRequestFactory Factory to use; anything that is
	 *  not an HttpRequestFactory is replaced by the instance from MediaWikiServices.
	 */
	public function __construct( $httpRequestFactory = null ) {
		if ( !$httpRequestFactory instanceof HttpRequestFactory ) {
			$httpRequestFactory = MediaWikiServices::getInstance()->getHttpRequestFactory();
		}

		$this->httpRequestFactory = $httpRequestFactory;
	}

	/**
	 * Returns the normalized form of the given page title, using the
	 * normalization rules of the given site. If $followRedirect is set to
	 * self::FOLLOW_REDIRECT (default) and the given title is a redirect, the
	 * redirect will be resolved and the redirect target is returned.
	 * Only titles of existing pages will be returned.
	 *
	 * @note This actually makes an API request to the remote site, so beware
	 *   that this function is slow and depends on an external service.
	 *
	 * @see Site::normalizePageName
	 *
	 * @since 1.37 Added $followRedirect
	 *
	 * @param string $pageName
	 * @param string $apiUrl
	 * @param int $followRedirect either self::FOLLOW_REDIRECT or self::NOFOLLOW_REDIRECT
	 *
	 * @throws InvalidArgumentException If $followRedirect is neither of the two constants.
	 * @return string|false The normalized form of the title,
	 *  or false to indicate an invalid title, a missing page,
	 *  or some other kind of error.
	 */
	public function normalizePageName( string $pageName, $apiUrl, $followRedirect = self::FOLLOW_REDIRECT ) {
		if ( $followRedirect === self::FOLLOW_REDIRECT ) {
			$redirects = true;
		} elseif ( $followRedirect === self::NOFOLLOW_REDIRECT ) {
			$redirects = false;
		} else {
			throw new InvalidArgumentException( '$followRedirect is not properly set: ' . $followRedirect );
		}

		// Make sure the string is normalized into NFC (due to T42017)
		// but do nothing to the whitespaces, that should work appropriately.
		// @see https://phabricator.wikimedia.org/T42017
		$pageName = Validator::cleanUp( $pageName );

		// Build the args for the specific call.
		// NOTE(review): the 'action', 'prop' and 'format' entries were missing from the
		// damaged source and are restored here from how the reply is consumed below
		// (JSON decoding, 'query' section) - confirm against the upstream file.
		$args = [
			'action' => 'query',
			'prop' => 'info',
			'redirects' => $redirects,
			'converttitles' => true,
			'format' => 'json',
			'titles' => $pageName,
			// @todo options for maxlag and maxage
			// Note that maxlag will lead to a long delay before a reply is made,
			// but that maxage can avoid the extreme delay. On the other hand
			// maxage could be nice to use anyhow as it stops unnecessary requests.
			// Also consider smaxage if maxage is used.
		];

		$url = wfAppendQuery( $apiUrl, $args );

		// Go on call the external site
		// @todo we need a good way to specify a timeout here.
		$ret = $this->httpRequestFactory->get( $url, [], __METHOD__ );

		if ( $ret === null ) {
			wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
			return false;
		}

		$data = FormatJson::decode( $ret, true );

		if ( !is_array( $data ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
			return false;
		}

		// self:: rather than static:: - the helper is private, so late static
		// binding could only ever break the call in a subclass.
		$page = self::extractPageRecord( $data, $pageName );

		if ( isset( $page['missing'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
				. $ret );
			return false;
		}

		if ( isset( $page['invalid'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
				. $ret );
			return false;
		}

		if ( !isset( $page['title'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
			return false;
		}

		return $page['title'];
	}

	/**
	 * Get normalization record for a given page title from an API response.
	 *
	 * When the reply contains exactly one page, that page is returned as is.
	 * Otherwise the title is traced through the 'normalized', 'converted' and
	 * 'redirects' lists (each mapping 'from' to 'to') and finally looked up by
	 * 'title' in the 'pages' list.
	 *
	 * @param array $externalData A reply from the API on a external server.
	 * @param string $pageTitle Identifies the page at the external site, needing normalization.
	 *
	 * @return array|false A 'page' structure representing the page identified by $pageTitle,
	 *  or false if the reply has no 'query' section or no unambiguous match was found.
	 */
	private static function extractPageRecord( $externalData, $pageTitle ) {
		// Make initial checks and return if prerequisites are not met.
		if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
			return false;
		}
		$query = $externalData['query'];

		// If there is a special case with only one returned page
		// we can cheat, and only return
		// the single page in the "pages" substructure.
		if ( isset( $query['pages'] ) ) {
			$pages = array_values( $query['pages'] );
			if ( count( $pages ) === 1 ) {
				return $pages[0];
			}
		}

		// The remainder is only used during internal testing, as it is assumed
		// a more optimal (and lossfree) storage.
		// Loop over the different named structures, that otherwise are similar.
		// NOTE(review): the 'pages' => 'title' entry was missing from the damaged
		// source; it is implied by the 'title' branch below - confirm upstream.
		$structs = [
			'normalized' => 'from',
			'converted' => 'from',
			'redirects' => 'from',
			'pages' => 'title',
		];

		foreach ( $structs as $listId => $fieldId ) {
			// Check if the substructure exist at all.
			if ( !isset( $query[$listId] ) ) {
				continue;
			}

			// Filter the substructure down to what we actually are using.
			// array_filter() preserves keys, so re-index the result: the single
			// hit must be reachable at offset 0 wherever it sat in the list.
			$collectedHits = array_values( array_filter(
				array_values( $query[$listId] ),
				static function ( $a ) use ( $fieldId, $pageTitle ) {
					return ( $a[$fieldId] ?? null ) === $pageTitle;
				}
			) );
			$hitCount = count( $collectedHits );

			if ( $fieldId === 'from' ) {
				// Still looping over normalization, conversion or redirects:
				// keep the new page title for later rounds.
				if ( $hitCount > 1 ) {
					// Ambiguous mapping.
					return false;
				}
				if ( $hitCount === 1 ) {
					$pageTitle = $collectedHits[0]['to'];
				}
			} else {
				// On the pages structure: exactly one match is the result,
				// anything else is a failure.
				return $hitCount === 1 ? $collectedHits[0] : false;
			}
		}

		// should never be here
		return false;
	}

}