Bug 470455 - test_database_sync_embed_visits.js leaks, r=sdwilsh
[wine-gecko.git] / browser / components / feeds / src / nsFeedSniffer.cpp
blob19aa0fb2f01ab7c1d55d4a1c660ee6755ba241f0
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
15 * The Original Code is the Feed Content Sniffer.
17 * The Initial Developer of the Original Code is Google Inc.
18 * Portions created by the Initial Developer are Copyright (C) 2006
19 * the Initial Developer. All Rights Reserved.
21 * Contributor(s):
22 * Ben Goodger <beng@google.com>
23 * Robert Sayre <sayrer@gmail.com>
25 * Alternatively, the contents of this file may be used under the terms of
26 * either the GNU General Public License Version 2 or later (the "GPL"), or
27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
37 * ***** END LICENSE BLOCK ***** */
39 #include "nsFeedSniffer.h"
41 #include "prmem.h"
43 #include "nsNetCID.h"
44 #include "nsXPCOM.h"
45 #include "nsCOMPtr.h"
46 #include "nsStringStream.h"
48 #include "nsBrowserCompsCID.h"
50 #include "nsICategoryManager.h"
51 #include "nsIServiceManager.h"
52 #include "nsComponentManagerUtils.h"
53 #include "nsServiceManagerUtils.h"
55 #include "nsIStreamConverterService.h"
56 #include "nsIStreamConverter.h"
58 #include "nsIStreamListener.h"
60 #include "nsIHttpChannel.h"
61 #include "nsIMIMEHeaderParam.h"
63 #include "nsMimeTypes.h"
// MIME types that identify a feed. TYPE_MAYBE_FEED is the internal type the
// sniffer reports when it decides a response should be treated as a feed.
#define TYPE_ATOM        "application/atom+xml"
#define TYPE_RSS         "application/rss+xml"
#define TYPE_MAYBE_FEED  "application/vnd.mozilla.maybe.feed"

// XML namespace URIs that must both be present for an <rdf:RDF> document to
// be considered an RSS 1.0 feed.
#define NS_RDF  "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
#define NS_RSS  "http://purl.org/rss/1.0/"

// Upper bound on the number of leading bytes that get scanned for feed markers.
#define MAX_BYTES  512
74 NS_IMPL_ISUPPORTS3(nsFeedSniffer,
75 nsIContentSniffer,
76 nsIStreamListener,
77 nsIRequestObserver)
79 nsresult
80 nsFeedSniffer::ConvertEncodedData(nsIRequest* request,
81 const PRUint8* data,
82 PRUint32 length)
84 nsresult rv = NS_OK;
86 mDecodedData = "";
87 nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request));
88 if (!httpChannel)
89 return NS_ERROR_NO_INTERFACE;
91 nsCAutoString contentEncoding;
92 httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
93 contentEncoding);
94 if (!contentEncoding.IsEmpty()) {
95 nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID));
96 if (converterService) {
97 ToLowerCase(contentEncoding);
99 nsCOMPtr<nsIStreamListener> converter;
100 rv = converterService->AsyncConvertData(contentEncoding.get(),
101 "uncompressed", this, nsnull,
102 getter_AddRefs(converter));
103 NS_ENSURE_SUCCESS(rv, rv);
105 converter->OnStartRequest(request, nsnull);
107 nsCOMPtr<nsIStringInputStream> rawStream =
108 do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID);
109 if (!rawStream)
110 return NS_ERROR_FAILURE;
112 rv = rawStream->SetData((const char*)data, length);
113 NS_ENSURE_SUCCESS(rv, rv);
115 rv = converter->OnDataAvailable(request, nsnull, rawStream, 0, length);
116 NS_ENSURE_SUCCESS(rv, rv);
118 converter->OnStopRequest(request, nsnull, NS_OK);
121 return rv;
124 template<int N>
125 static PRBool
126 StringBeginsWithLowercaseLiteral(nsAString& aString,
127 const char (&aSubstring)[N])
129 return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring);
132 // XXXsayrer put this in here to get on the branch with minimal delay.
133 // Trunk really needs to factor this out. This is the third usage.
134 PRBool
135 HasAttachmentDisposition(nsIHttpChannel* httpChannel)
137 if (!httpChannel)
138 return PR_FALSE;
140 nsCAutoString contentDisposition;
141 nsresult rv =
142 httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("content-disposition"),
143 contentDisposition);
145 if (NS_SUCCEEDED(rv) && !contentDisposition.IsEmpty()) {
146 nsCOMPtr<nsIURI> uri;
147 httpChannel->GetURI(getter_AddRefs(uri));
148 nsCOMPtr<nsIMIMEHeaderParam> mimehdrpar =
149 do_GetService(NS_MIMEHEADERPARAM_CONTRACTID, &rv);
150 if (NS_SUCCEEDED(rv))
152 nsCAutoString fallbackCharset;
153 if (uri)
154 uri->GetOriginCharset(fallbackCharset);
155 nsAutoString dispToken;
156 // Get the disposition type
157 rv = mimehdrpar->GetParameter(contentDisposition, "", fallbackCharset,
158 PR_TRUE, nsnull, dispToken);
159 // RFC 2183, section 2.8 says that an unknown disposition
160 // value should be treated as "attachment"
161 // XXXbz this code is duplicated in GetFilenameAndExtensionFromChannel in
162 // nsExternalHelperAppService. Factor it out!
163 if (NS_FAILED(rv) ||
164 (!dispToken.IsEmpty() &&
165 !StringBeginsWithLowercaseLiteral(dispToken, "inline") &&
166 // Broken sites just send
167 // Content-Disposition: filename="file"
168 // without a disposition token... screen those out.
169 !StringBeginsWithLowercaseLiteral(dispToken, "filename") &&
170 // Also in use is Content-Disposition: name="file"
171 !StringBeginsWithLowercaseLiteral(dispToken, "name")))
172 // We have a content-disposition of "attachment" or unknown
173 return PR_TRUE;
177 return PR_FALSE;
181 * @return the first occurrence of a character within a string buffer,
182 * or nsnull if not found
184 static const char*
185 FindChar(char c, const char *begin, const char *end)
187 for (; begin < end; ++begin) {
188 if (*begin == c)
189 return begin;
191 return nsnull;
196 * Determine if a substring is the "documentElement" in the document.
198 * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document"
199 * element within the XML DOM, i.e. the root container element. Otherwise,
200 * it's possible that someone embedded one of these tags inside a document of
201 * another type, e.g. a HTML document, and we don't want to show the preview
202 * page if the document isn't actually a feed.
204 * @param start
205 * The beginning of the data being sniffed
206 * @param end
207 * The end of the data being sniffed, right before the substring that
208 * was found.
209 * @returns PR_TRUE if the found substring is the documentElement, PR_FALSE
210 * otherwise.
212 static PRBool
213 IsDocumentElement(const char *start, const char* end)
215 // For every tag in the buffer, check to see if it's a PI, Doctype or
216 // comment, our desired substring or something invalid.
217 while ( (start = FindChar('<', start, end)) ) {
218 ++start;
219 if (start >= end)
220 return PR_FALSE;
222 // Check to see if the character following the '<' is either '?' or '!'
223 // (processing instruction or doctype or comment)... these are valid nodes
224 // to have in the prologue.
225 if (*start != '?' && *start != '!')
226 return PR_FALSE;
228 // Now advance the iterator until the '>' (We do this because we don't want
229 // to sniff indicator substrings that are embedded within other nodes, e.g.
230 // comments: <!-- <rdf:RDF .. > -->
231 start = FindChar('>', start, end);
232 if (!start)
233 return PR_FALSE;
235 ++start;
237 return PR_TRUE;
241 * Determines whether or not a string exists as the root element in an XML data
242 * string buffer.
243 * @param dataString
244 * The data being sniffed
245 * @param substring
246 * The substring being tested for existence and root-ness.
247 * @returns PR_TRUE if the substring exists and is the documentElement, PR_FALSE
248 * otherwise.
250 static PRBool
251 ContainsTopLevelSubstring(nsACString& dataString, const char *substring)
253 PRInt32 offset = dataString.Find(substring);
254 if (offset == -1)
255 return PR_FALSE;
257 const char *begin = dataString.BeginReading();
259 // Only do the validation when we find the substring.
260 return IsDocumentElement(begin, begin + offset);
263 NS_IMETHODIMP
264 nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request,
265 const PRUint8* data,
266 PRUint32 length,
267 nsACString& sniffedType)
269 nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request));
270 if (!channel)
271 return NS_ERROR_NO_INTERFACE;
273 // Check that this is a GET request, since you can't subscribe to a POST...
274 nsCAutoString method;
275 channel->GetRequestMethod(method);
276 if (!method.Equals("GET")) {
277 sniffedType.Truncate();
278 return NS_OK;
281 // We need to find out if this is a load of a view-source document. In this
282 // case we do not want to override the content type, since the source display
283 // does not need to be converted from feed format to XUL. More importantly,
284 // we don't want to change the content type from something
285 // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html
286 // etc) to something that only the application fe knows about (maybe.feed)
287 // thus deactivating syntax highlighting.
288 nsCOMPtr<nsIURI> originalURI;
289 channel->GetOriginalURI(getter_AddRefs(originalURI));
291 nsCAutoString scheme;
292 originalURI->GetScheme(scheme);
293 if (scheme.EqualsLiteral("view-source")) {
294 sniffedType.Truncate();
295 return NS_OK;
298 // Check the Content-Type to see if it is set correctly. If it is set to
299 // something specific that we think is a reliable indication of a feed, don't
300 // bother sniffing since we assume the site maintainer knows what they're
301 // doing.
302 nsCAutoString contentType;
303 channel->GetContentType(contentType);
304 PRBool noSniff = contentType.EqualsLiteral(TYPE_RSS) ||
305 contentType.EqualsLiteral(TYPE_ATOM);
307 // Check to see if this was a feed request from the location bar or from
308 // the feed: protocol. This is also a reliable indication.
309 // The value of the header doesn't matter.
310 if (!noSniff) {
311 nsCAutoString sniffHeader;
312 nsresult foundHeader =
313 channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
314 sniffHeader);
315 noSniff = NS_SUCCEEDED(foundHeader);
318 if (noSniff) {
319 // check for an attachment after we have a likely feed.
320 if(HasAttachmentDisposition(channel)) {
321 sniffedType.Truncate();
322 return NS_OK;
325 // set the feed header as a response header, since we have good metadata
326 // telling us that the feed is supposed to be RSS or Atom
327 channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
328 NS_LITERAL_CSTRING("1"), PR_FALSE);
329 sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
330 return NS_OK;
333 // Don't sniff arbitrary types. Limit sniffing to situations that
334 // we think can reasonably arise.
335 if (!contentType.EqualsLiteral(TEXT_HTML) &&
336 !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) &&
337 // Same criterion as XMLHttpRequest. Should we be checking for "+xml"
338 // and check for text/xml and application/xml by hand instead?
339 contentType.Find("xml") == -1) {
340 sniffedType.Truncate();
341 return NS_OK;
344 // Now we need to potentially decompress data served with
345 // Content-Encoding: gzip
346 nsresult rv = ConvertEncodedData(request, data, length);
347 if (NS_FAILED(rv))
348 return rv;
350 const char* testData =
351 mDecodedData.IsEmpty() ? (const char*)data : mDecodedData.get();
353 // The strategy here is based on that described in:
354 // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx
355 // for interoperarbility purposes.
357 // We cap the number of bytes to scan at MAX_BYTES to prevent picking up
358 // false positives by accidentally reading document content, e.g. a "how to
359 // make a feed" page.
360 if (length > MAX_BYTES)
361 length = MAX_BYTES;
363 // Thus begins the actual sniffing.
364 nsDependentCSubstring dataString((const char*)testData, length);
366 PRBool isFeed = PR_FALSE;
368 // RSS 0.91/0.92/2.0
369 isFeed = ContainsTopLevelSubstring(dataString, "<rss");
371 // Atom 1.0
372 if (!isFeed)
373 isFeed = ContainsTopLevelSubstring(dataString, "<feed");
375 // RSS 1.0
376 if (!isFeed) {
377 isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") &&
378 dataString.Find(NS_RDF) != -1 &&
379 dataString.Find(NS_RSS) != -1;
382 // If we sniffed a feed, coerce our internal type
383 if (isFeed && !HasAttachmentDisposition(channel))
384 sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
385 else
386 sniffedType.Truncate();
387 return NS_OK;
390 NS_IMETHODIMP
391 nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context)
393 return NS_OK;
396 NS_METHOD
397 nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream,
398 void* closure,
399 const char* rawSegment,
400 PRUint32 toOffset,
401 PRUint32 count,
402 PRUint32* writeCount)
404 nsCString* decodedData = static_cast<nsCString*>(closure);
405 decodedData->Append(rawSegment, count);
406 *writeCount = count;
407 return NS_OK;
410 NS_IMETHODIMP
411 nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context,
412 nsIInputStream* stream, PRUint32 offset,
413 PRUint32 count)
415 PRUint32 read;
416 return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count,
417 &read);
420 NS_IMETHODIMP
421 nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context,
422 nsresult status)
424 return NS_OK;
427 NS_METHOD
428 nsFeedSniffer::Register(nsIComponentManager *compMgr, nsIFile *path,
429 const char *registryLocation,
430 const char *componentType,
431 const nsModuleComponentInfo *info)
433 nsresult rv;
434 nsCOMPtr<nsICategoryManager> catman = do_GetService(NS_CATEGORYMANAGER_CONTRACTID, &rv);
435 if (NS_FAILED(rv))
436 return rv;
438 return catman->AddCategoryEntry(NS_CONTENT_SNIFFER_CATEGORY, "Feed Sniffer",
439 NS_FEEDSNIFFER_CONTRACTID, PR_TRUE, PR_TRUE,
440 nsnull);