1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
15 * The Original Code is the Feed Content Sniffer.
17 * The Initial Developer of the Original Code is Google Inc.
18 * Portions created by the Initial Developer are Copyright (C) 2006
19 * the Initial Developer. All Rights Reserved.
22 * Ben Goodger <beng@google.com>
23 * Robert Sayre <sayrer@gmail.com>
25 * Alternatively, the contents of this file may be used under the terms of
26 * either the GNU General Public License Version 2 or later (the "GPL"), or
27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
37 * ***** END LICENSE BLOCK ***** */
39 #include "nsFeedSniffer.h"
46 #include "nsStringStream.h"
48 #include "nsBrowserCompsCID.h"
50 #include "nsICategoryManager.h"
51 #include "nsIServiceManager.h"
52 #include "nsComponentManagerUtils.h"
53 #include "nsServiceManagerUtils.h"
55 #include "nsIStreamConverterService.h"
56 #include "nsIStreamConverter.h"
58 #include "nsIStreamListener.h"
60 #include "nsIHttpChannel.h"
61 #include "nsIMIMEHeaderParam.h"
63 #include "nsMimeTypes.h"
65 #define TYPE_ATOM "application/atom+xml"
66 #define TYPE_RSS "application/rss+xml"
67 #define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed"
69 #define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
70 #define NS_RSS "http://purl.org/rss/1.0/"
74 NS_IMPL_ISUPPORTS3(nsFeedSniffer
,
// Feeds the sniffed buffer through a stream converter when the HTTP response
// carries a Content-Encoding header. The converter is created with this object
// as its listener, and is then driven by hand through OnStartRequest,
// OnDataAvailable and OnStopRequest.
// NOTE(review): parts of this function (remaining parameters, guard
// conditions, closing braces, final return) are not visible in this excerpt.
80 nsFeedSniffer::ConvertEncodedData(nsIRequest
* request
,
87 nsCOMPtr
<nsIHttpChannel
> httpChannel(do_QueryInterface(request
));
// NOTE(review): the condition guarding this return is not visible; presumably
// it fires when request is not an nsIHttpChannel - confirm.
89 return NS_ERROR_NO_INTERFACE
;
91 nsCAutoString contentEncoding
;
92 httpChannel
->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
// Conversion is only attempted when the header value is non-empty.
94 if (!contentEncoding
.IsEmpty()) {
95 nsCOMPtr
<nsIStreamConverterService
> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID
));
96 if (converterService
) {
// The encoding name is lower-cased before being used as the source type of
// the conversion.
97 ToLowerCase(contentEncoding
);
99 nsCOMPtr
<nsIStreamListener
> converter
;
// Ask for a converter from the response encoding to the uncompressed type,
// with this object registered as the listener.
100 rv
= converterService
->AsyncConvertData(contentEncoding
.get(),
101 "uncompressed", this, nsnull
,
102 getter_AddRefs(converter
));
103 NS_ENSURE_SUCCESS(rv
, rv
);
// The return value of OnStartRequest is not checked here.
105 converter
->OnStartRequest(request
, nsnull
);
// Wrap the raw bytes in a string input stream so they can be handed to the
// converter.
107 nsCOMPtr
<nsIStringInputStream
> rawStream
=
108 do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID
);
// NOTE(review): the guard for this return is not visible; presumably it
// fires when rawStream could not be created - confirm.
110 return NS_ERROR_FAILURE
;
112 rv
= rawStream
->SetData((const char*)data
, length
);
113 NS_ENSURE_SUCCESS(rv
, rv
);
// Push the whole buffer through the converter in one call (offset 0, length
// bytes).
115 rv
= converter
->OnDataAvailable(request
, nsnull
, rawStream
, 0, length
);
116 NS_ENSURE_SUCCESS(rv
, rv
);
// Signal end of data with an NS_OK status; the return value is not checked.
118 converter
->OnStopRequest(request
, nsnull
, NS_OK
);
// Tests whether aString starts with the literal aSubstring by taking the
// first N characters of aString with StringHead and comparing them through
// LowerCaseEqualsLiteral.
// NOTE(review): N is the array bound of aSubstring, which for a string literal
// includes the terminating NUL - confirm that StringHead(aString, N) is the
// intended prefix length when aString is longer than the literal.
126 StringBeginsWithLowercaseLiteral(nsAString
& aString
,
127 const char (&aSubstring
)[N
])
129 return StringHead(aString
, N
).LowerCaseEqualsLiteral(aSubstring
);
132 // XXXsayrer put this in here to get on the branch with minimal delay.
133 // Trunk really needs to factor this out. This is the third usage.
// Inspects the content-disposition response header of httpChannel. The
// disposition token is extracted with nsIMIMEHeaderParam::GetParameter and
// then tested against the prefixes inline, filename and name.
// NOTE(review): the return statements and several guards of this function are
// not visible in this excerpt.
135 HasAttachmentDisposition(nsIHttpChannel
* httpChannel
)
140 nsCAutoString contentDisposition
;
142 httpChannel
->GetResponseHeader(NS_LITERAL_CSTRING("content-disposition"),
// Only parse the header when it was fetched successfully and is non-empty.
145 if (NS_SUCCEEDED(rv
) && !contentDisposition
.IsEmpty()) {
146 nsCOMPtr
<nsIURI
> uri
;
147 httpChannel
->GetURI(getter_AddRefs(uri
));
148 nsCOMPtr
<nsIMIMEHeaderParam
> mimehdrpar
=
149 do_GetService(NS_MIMEHEADERPARAM_CONTRACTID
, &rv
);
150 if (NS_SUCCEEDED(rv
))
152 nsCAutoString fallbackCharset
;
// The origin charset of the channel URI is passed to GetParameter below as
// the fallback charset.
// NOTE(review): no null check on uri is visible before this call - confirm
// that one exists on the lines missing from this excerpt.
154 uri
->GetOriginCharset(fallbackCharset
);
155 nsAutoString dispToken
;
156 // Get the disposition type
157 rv
= mimehdrpar
->GetParameter(contentDisposition
, "", fallbackCharset
,
158 PR_TRUE
, nsnull
, dispToken
);
159 // RFC 2183, section 2.8 says that an unknown disposition
160 // value should be treated as "attachment"
161 // XXXbz this code is duplicated in GetFilenameAndExtensionFromChannel in
162 // nsExternalHelperAppService. Factor it out!
// NOTE(review): the start of this condition is not visible. The visible part
// holds when the token is non-empty and begins with none of inline, filename
// or name.
164 (!dispToken
.IsEmpty() &&
165 !StringBeginsWithLowercaseLiteral(dispToken
, "inline") &&
166 // Broken sites just send
167 // Content-Disposition: filename="file"
168 // without a disposition token... screen those out.
169 !StringBeginsWithLowercaseLiteral(dispToken
, "filename") &&
170 // Also in use is Content-Disposition: name="file"
171 !StringBeginsWithLowercaseLiteral(dispToken
, "name")))
172 // We have a content-disposition of "attachment" or unknown
181 * @return the first occurrence of a character within a string buffer,
182 * or nsnull if not found
// Walks the half-open range [begin, end) one character at a time, advancing
// begin itself as the cursor.
// NOTE(review): the loop body and the return statements are not visible in
// this excerpt; presumably the cursor is returned when it points at c -
// confirm.
185 FindChar(char c
, const char *begin
, const char *end
)
187 for (; begin
< end
; ++begin
) {
196 * Determine if a substring is the "documentElement" in the document.
198 * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document"
199 * element within the XML DOM, i.e. the root container element. Otherwise,
200 * it's possible that someone embedded one of these tags inside a document of
201 * another type, e.g. an HTML document, and we don't want to show the preview
202 * page if the document isn't actually a feed.
205 * The beginning of the data being sniffed
207 * The end of the data being sniffed, right before the substring that
209 * @returns PR_TRUE if the found substring is the documentElement, PR_FALSE
// Scans the range [start, end) tag by tag using FindChar. Each iteration
// locates the next opening angle bracket, inspects a character against the
// two markers ? and !, and then moves start forward to the next closing angle
// bracket.
// NOTE(review): the step that advances start past the opening bracket, the
// exit taken when the marker test fails, the handling of a failed search for
// the closing bracket, and the final return are not visible in this excerpt.
213 IsDocumentElement(const char *start
, const char* end
)
215 // For every tag in the buffer, check to see if it's a PI, Doctype or
216 // comment, our desired substring or something invalid.
217 while ( (start
= FindChar('<', start
, end
)) ) {
222 // Check to see if the character following the '<' is either '?' or '!'
223 // (processing instruction or doctype or comment)... these are valid nodes
224 // to have in the prologue.
225 if (*start
!= '?' && *start
!= '!')
228 // Now advance the iterator until the '>' (We do this because we don't want
229 // to sniff indicator substrings that are embedded within other nodes, e.g.
230 // comments: <!-- <rdf:RDF .. > -->
231 start
= FindChar('>', start
, end
);
241 * Determines whether or not a string exists as the root element in an XML data
244 * The data being sniffed
246 * The substring being tested for existence and root-ness.
247 * @returns PR_TRUE if the substring exists and is the documentElement, PR_FALSE
// Looks up substring inside dataString and, using the offset of the match,
// asks IsDocumentElement to validate everything that precedes it.
251 ContainsTopLevelSubstring(nsACString
& dataString
, const char *substring
)
253 PRInt32 offset
= dataString
.Find(substring
);
// NOTE(review): the handling of a failed Find is not visible in this excerpt.
// The pointer begin + offset below is only meaningful when the substring was
// found - confirm that a guard exists on the missing lines.
257 const char *begin
= dataString
.BeginReading();
259 // Only do the validation when we find the substring.
// The validated range stops right before the match, at begin + offset.
260 return IsDocumentElement(begin
, begin
+ offset
);
// Decides whether the data of an HTTP request looks like a feed and reports
// the result through sniffedType: it is assigned TYPE_MAYBE_FEED for a feed
// and truncated to the empty string otherwise.
// Visible stages, in order: request method check, view-source check, trusted
// Content-Type or X-Moz-Is-Feed request header, content type whitelist,
// optional decoding through ConvertEncodedData, and finally a scan of the data
// for the root elements of RSS, Atom and RDF based feeds.
// NOTE(review): some parameters, guard conditions, early returns and closing
// braces of this function are not visible in this excerpt.
264 nsFeedSniffer::GetMIMETypeFromContent(nsIRequest
* request
,
267 nsACString
& sniffedType
)
269 nsCOMPtr
<nsIHttpChannel
> channel(do_QueryInterface(request
));
// NOTE(review): the guard for this return is not visible; presumably it fires
// when request is not an nsIHttpChannel - confirm.
271 return NS_ERROR_NO_INTERFACE
;
273 // Check that this is a GET request, since you can't subscribe to a POST...
274 nsCAutoString method
;
275 channel
->GetRequestMethod(method
);
276 if (!method
.Equals("GET")) {
277 sniffedType
.Truncate();
281 // We need to find out if this is a load of a view-source document. In this
282 // case we do not want to override the content type, since the source display
283 // does not need to be converted from feed format to XUL. More importantly,
284 // we don't want to change the content type from something
285 // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html
286 // etc) to something that only the application fe knows about (maybe.feed)
287 // thus deactivating syntax highlighting.
288 nsCOMPtr
<nsIURI
> originalURI
;
289 channel
->GetOriginalURI(getter_AddRefs(originalURI
));
291 nsCAutoString scheme
;
// NOTE(review): no null check on originalURI is visible before this call -
// confirm that one exists on the lines missing from this excerpt.
292 originalURI
->GetScheme(scheme
);
293 if (scheme
.EqualsLiteral("view-source")) {
294 sniffedType
.Truncate();
298 // Check the Content-Type to see if it is set correctly. If it is set to
299 // something specific that we think is a reliable indication of a feed, don't
300 // bother sniffing since we assume the site maintainer knows what they're
302 nsCAutoString contentType
;
303 channel
->GetContentType(contentType
);
// noSniff starts out true when the server already declared an RSS or Atom
// content type.
304 PRBool noSniff
= contentType
.EqualsLiteral(TYPE_RSS
) ||
305 contentType
.EqualsLiteral(TYPE_ATOM
);
307 // Check to see if this was a feed request from the location bar or from
308 // the feed: protocol. This is also a reliable indication.
309 // The value of the header doesn't matter.
311 nsCAutoString sniffHeader
;
312 nsresult foundHeader
=
313 channel
->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
// Only the success of the header lookup is used; sniffHeader itself is not
// read in the visible code.
315 noSniff
= NS_SUCCEEDED(foundHeader
);
319 // check for an attachment after we have a likely feed.
320 if(HasAttachmentDisposition(channel
)) {
321 sniffedType
.Truncate();
325 // set the feed header as a response header, since we have good metadata
326 // telling us that the feed is supposed to be RSS or Atom
327 channel
->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
328 NS_LITERAL_CSTRING("1"), PR_FALSE
);
329 sniffedType
.AssignLiteral(TYPE_MAYBE_FEED
);
333 // Don't sniff arbitrary types. Limit sniffing to situations that
334 // we think can reasonably arise.
335 if (!contentType
.EqualsLiteral(TEXT_HTML
) &&
336 !contentType
.EqualsLiteral(APPLICATION_OCTET_STREAM
) &&
337 // Same criterion as XMLHttpRequest. Should we be checking for "+xml"
338 // and check for text/xml and application/xml by hand instead?
339 contentType
.Find("xml") == -1) {
340 sniffedType
.Truncate();
344 // Now we need to potentially decompress data served with
345 // Content-Encoding: gzip
346 nsresult rv
= ConvertEncodedData(request
, data
, length
);
// Prefer the decoded buffer when the conversion produced any output,
// otherwise sniff the raw bytes.
// NOTE(review): when mDecodedData is used, length still describes the raw
// buffer in the visible code - confirm that it is adjusted to the decoded
// size on the lines missing from this excerpt.
350 const char* testData
=
351 mDecodedData
.IsEmpty() ? (const char*)data
: mDecodedData
.get();
353 // The strategy here is based on that described in:
354 // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx
355 // for interoperability purposes.
357 // We cap the number of bytes to scan at MAX_BYTES to prevent picking up
358 // false positives by accidentally reading document content, e.g. a "how to
359 // make a feed" page.
360 if (length
> MAX_BYTES
)
363 // Thus begins the actual sniffing.
364 nsDependentCSubstring
dataString((const char*)testData
, length
);
366 PRBool isFeed
= PR_FALSE
;
// Three root elements are probed. NOTE(review): the conditions that chain
// these probes together are not visible in this excerpt.
369 isFeed
= ContainsTopLevelSubstring(dataString
, "<rss");
373 isFeed
= ContainsTopLevelSubstring(dataString
, "<feed");
// An RDF root only counts as a feed when both the RDF and the RSS 1.0
// namespace URIs also occur in the data.
377 isFeed
= ContainsTopLevelSubstring(dataString
, "<rdf:RDF") &&
378 dataString
.Find(NS_RDF
) != -1 &&
379 dataString
.Find(NS_RSS
) != -1;
382 // If we sniffed a feed, coerce our internal type
383 if (isFeed
&& !HasAttachmentDisposition(channel
))
384 sniffedType
.AssignLiteral(TYPE_MAYBE_FEED
);
386 sniffedType
.Truncate();
391 nsFeedSniffer::OnStartRequest(nsIRequest
* request
, nsISupports
* context
)
// Segment callback that OnDataAvailable hands to ReadSegments. It treats
// closure as a pointer to an nsCString and appends count bytes of rawSegment
// to it; OnDataAvailable passes the address of mDecodedData as that closure.
// NOTE(review): some parameters, the handling of writeCount and the return
// statement are not visible in this excerpt.
397 nsFeedSniffer::AppendSegmentToString(nsIInputStream
* inputStream
,
399 const char* rawSegment
,
402 PRUint32
* writeCount
)
404 nsCString
* decodedData
= static_cast<nsCString
*>(closure
);
405 decodedData
->Append(rawSegment
, count
);
// Drains stream through ReadSegments with AppendSegmentToString as the
// callback and the address of mDecodedData as its closure, so the bytes read
// are accumulated in mDecodedData. The result of ReadSegments is returned to
// the caller.
// NOTE(review): the last parameter of the signature and the final argument of
// the ReadSegments call are not visible in this excerpt.
411 nsFeedSniffer::OnDataAvailable(nsIRequest
* request
, nsISupports
* context
,
412 nsIInputStream
* stream
, PRUint32 offset
,
416 return stream
->ReadSegments(AppendSegmentToString
, &mDecodedData
, count
,
421 nsFeedSniffer::OnStopRequest(nsIRequest
* request
, nsISupports
* context
,
428 nsFeedSniffer::Register(nsIComponentManager
*compMgr
, nsIFile
*path
,
429 const char *registryLocation
,
430 const char *componentType
,
431 const nsModuleComponentInfo
*info
)
434 nsCOMPtr
<nsICategoryManager
> catman
= do_GetService(NS_CATEGORYMANAGER_CONTRACTID
, &rv
);
438 return catman
->AddCategoryEntry(NS_CONTENT_SNIFFER_CATEGORY
, "Feed Sniffer",
439 NS_FEEDSNIFFER_CONTRACTID
, PR_TRUE
, PR_TRUE
,