Run generateLocalAutoload.php
[mediawiki.git] / includes / site / MediaWikiPageNameNormalizer.php
blob2f7173677906ff749370ca7cb45a6dcf538cc446
1 <?php
3 namespace MediaWiki\Site;
5 use FormatJson;
6 use Http;
7 use UtfNormal\Validator;
/**
 * Service for normalizing a page name using a MediaWiki api.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @since 1.27
 *
 * @license GNU GPL v2+
 * @author John Erling Blad < jeblad@gmail.com >
 * @author Daniel Kinzler
 * @author Jeroen De Dauw < jeroendedauw@gmail.com >
 * @author Marius Hoch
 */
class MediaWikiPageNameNormalizer {

	/**
	 * Returns the normalized form of the given page title, using the
	 * normalization rules of the given site. If the given title is a redirect,
	 * the redirect will be resolved and the redirect target is returned.
	 *
	 * @note This actually makes an API request to the remote site, so beware
	 *   that this function is slow and depends on an external service.
	 *
	 * @see Site::normalizePageName
	 *
	 * @since 1.27
	 *
	 * @param string $pageName
	 * @param string $apiUrl
	 *
	 * @return string|false The normalized title, or false when the request failed,
	 *   the reply was not usable JSON, or the reply marked the page as missing
	 *   or invalid or carried no title. Each failure is written to the
	 *   "MediaWikiSite" debug log.
	 * @throws \MWException If $pageName is not a string.
	 */
	public function normalizePageName( $pageName, $apiUrl ) {
		// Check if we have strings as arguments.
		if ( !is_string( $pageName ) ) {
			throw new \MWException( '$pageName must be a string' );
		}

		// Make sure the string is normalized into NFC (due to T42017)
		// but do nothing to the whitespaces, that should work appropriately.
		// @see https://phabricator.wikimedia.org/T42017
		$pageName = Validator::cleanUp( $pageName );

		// Build the args for the specific call
		$args = [
			'action' => 'query',
			'prop' => 'info',
			'redirects' => true,
			'converttitles' => true,
			'format' => 'json',
			'titles' => $pageName,
			// @todo options for maxlag and maxage
			// Note that maxlag will lead to a long delay before a reply is made,
			// but that maxage can avoid the extreme delay. On the other hand
			// maxage could be nice to use anyhow as it stops unnecessary requests.
			// Also consider smaxage if maxage is used.
		];

		$url = wfAppendQuery( $apiUrl, $args );

		// Go on call the external site
		// @todo we need a good way to specify a timeout here.
		$ret = Http::get( $url, [], __METHOD__ );

		if ( $ret === false ) {
			wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
			return false;
		}

		$data = FormatJson::decode( $ret, true );

		if ( !is_array( $data ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
			return false;
		}

		// The helper is private, so it must be resolved with self:: rather than
		// static:: (late static binding to a private method fails as soon as a
		// subclass declares a method of the same name).
		$page = self::extractPageRecord( $data, $pageName );

		// $page may be false here; isset() on a non-array simply yields false,
		// so that case ends up in the "did not return a page title" branch.
		if ( isset( $page['missing'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
				. $ret );
			return false;
		}

		if ( isset( $page['invalid'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
				. $ret );
			return false;
		}

		if ( !isset( $page['title'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
			return false;
		}

		return $page['title'];
	}

	/**
	 * Get normalization record for a given page title from an API response.
	 *
	 * @param array $externalData A reply from the API on a external server.
	 * @param string $pageTitle Identifies the page at the external site, needing normalization.
	 *
	 * @return array|bool A 'page' structure representing the page identified by $pageTitle,
	 *   or false if no single matching page record could be determined.
	 */
	private static function extractPageRecord( $externalData, $pageTitle ) {
		// Make initial checks and return if prerequisites are not met.
		if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
			return false;
		}

		$query = $externalData['query'];

		// If there is a special case with only one returned page
		// we can cheat, and only return
		// the single page in the "pages" substructure.
		if ( isset( $query['pages'] ) && is_array( $query['pages'] ) ) {
			$pages = array_values( $query['pages'] );
			if ( count( $pages ) === 1 ) {
				return $pages[0];
			}
		}

		// What follows is only used during internal testing, as it is assumed
		// a more optimal (and lossfree) storage.

		// Loop over the three different named structures, that otherwise are similar.
		// The order matters: each 'from' => 'to' mapping may rewrite $pageTitle
		// before the final lookup in 'pages'.
		$structs = [
			'normalized' => 'from',
			'converted' => 'from',
			'redirects' => 'from',
			'pages' => 'title'
		];

		foreach ( $structs as $listId => $fieldId ) {
			// Check if the substructure exist at all (and is usable).
			if ( !isset( $query[$listId] ) || !is_array( $query[$listId] ) ) {
				continue;
			}

			// Filter the substructure down to what we actually are using.
			// array_filter() preserves keys, so reindex the result to make the
			// single hit reliably available at offset 0.
			$collectedHits = array_values( array_filter(
				$query[$listId],
				function ( $a ) use ( $fieldId, $pageTitle ) {
					return is_array( $a )
						&& isset( $a[$fieldId] )
						&& $a[$fieldId] === $pageTitle;
				}
			) );

			$hitCount = count( $collectedHits );

			// More than one entry for the same title is ambiguous.
			if ( $hitCount > 1 ) {
				return false;
			}

			if ( $fieldId === 'title' ) {
				// On the pages structure we return: the single hit, or failure.
				return $hitCount === 1 ? $collectedHits[0] : false;
			}

			// Still looping over normalization, conversion or redirects:
			// keep the new page title for later rounds.
			if ( $hitCount === 1 ) {
				if ( !isset( $collectedHits[0]['to'] ) ) {
					// A mapping without a target cannot be followed.
					return false;
				}
				$pageTitle = $collectedHits[0]['to'];
			}
		}

		// Reached when the reply has no 'pages' structure at all.
		return false;
	}
}