1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
15 * The Original Code is mozilla.org code.
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 1999
20 * the Initial Developer. All Rights Reserved.
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
38 #include "nsUnknownDecoder.h"
39 #include "nsIServiceManager.h"
40 #include "nsIStreamConverterService.h"
43 #include "nsIInputStream.h"
44 #include "nsIOutputStream.h"
45 #include "nsMimeTypes.h"
47 #include "nsXPIDLString.h"
48 #include "nsIPrefService.h"
49 #include "nsIPrefBranch.h"
50 #include "nsICategoryManager.h"
51 #include "nsISupportsPrimitives.h"
52 #include "nsIContentSniffer.h"
56 #include "nsIMIMEService.h"
58 #include "nsIViewSourceChannel.h"
59 #include "nsIHttpChannel.h"
// Size, in bytes, of the sniffer buffer: the buffer is allocated with this
// many chars, and incoming data is only collected up to this limit before
// the content type is determined.
#define MAX_BUFFER_SIZE 1024
65 nsUnknownDecoder::nsUnknownDecoder()
68 , mRequireHTMLsuffix(PR_FALSE
)
70 nsCOMPtr
<nsIPrefBranch
> prefs
= do_GetService(NS_PREFSERVICE_CONTRACTID
);
73 if (NS_SUCCEEDED(prefs
->GetBoolPref("security.requireHTMLsuffix", &val
)))
74 mRequireHTMLsuffix
= val
;
78 nsUnknownDecoder::~nsUnknownDecoder()
88 // nsISupports implementation...
92 NS_IMPL_ADDREF(nsUnknownDecoder
)
93 NS_IMPL_RELEASE(nsUnknownDecoder
)
95 NS_INTERFACE_MAP_BEGIN(nsUnknownDecoder
)
96 NS_INTERFACE_MAP_ENTRY(nsIStreamConverter
)
97 NS_INTERFACE_MAP_ENTRY(nsIStreamListener
)
98 NS_INTERFACE_MAP_ENTRY(nsIRequestObserver
)
99 NS_INTERFACE_MAP_ENTRY(nsIContentSniffer
)
100 NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports
, nsIStreamListener
)
106 // nsIStreamConverter methods...
111 nsUnknownDecoder::Convert(nsIInputStream
*aFromStream
,
112 const char *aFromType
,
115 nsIInputStream
**aResultStream
)
117 return NS_ERROR_NOT_IMPLEMENTED
;
121 nsUnknownDecoder::AsyncConvertData(const char *aFromType
,
123 nsIStreamListener
*aListener
,
126 NS_ASSERTION(aListener
&& aFromType
&& aToType
,
127 "null pointer passed into multi mixed converter");
// Hook up our final listener; this is the one we forward the various On*() calls to.
131 mNextListener
= aListener
;
132 return (aListener
) ? NS_OK
: NS_ERROR_FAILURE
;
137 // nsIStreamListener methods...
142 nsUnknownDecoder::OnDataAvailable(nsIRequest
* request
,
144 nsIInputStream
*aStream
,
145 PRUint32 aSourceOffset
,
150 if (!mNextListener
) return NS_ERROR_FAILURE
;
152 if (mContentType
.IsEmpty()) {
155 // If the buffer has not been allocated by now, just fail...
156 if (!mBuffer
) return NS_ERROR_OUT_OF_MEMORY
;
// Determine how much of the stream should be read to fill up the sniffer buffer.
162 if (mBufferLen
+ aCount
>= MAX_BUFFER_SIZE
) {
163 count
= MAX_BUFFER_SIZE
-mBufferLen
;
168 // Read the data into the buffer...
169 rv
= aStream
->Read((mBuffer
+mBufferLen
), count
, &len
);
170 if (NS_FAILED(rv
)) return rv
;
177 // Adjust the source offset... The call to FireListenerNotifications(...)
178 // will make the first OnDataAvailable(...) call with an offset of 0.
179 // So, this offset needs to be adjusted to reflect that...
181 aSourceOffset
+= mBufferLen
;
183 DetermineContentType(request
);
185 rv
= FireListenerNotifications(request
, aCtxt
);
189 // Must not fire ODA again if it failed once
190 if (aCount
&& NS_SUCCEEDED(rv
)) {
191 NS_ASSERTION(!mContentType
.IsEmpty(),
192 "Content type should be known by now.");
194 rv
= mNextListener
->OnDataAvailable(request
, aCtxt
, aStream
,
195 aSourceOffset
, aCount
);
203 // nsIRequestObserver methods...
208 nsUnknownDecoder::OnStartRequest(nsIRequest
* request
, nsISupports
*aCtxt
)
212 if (!mNextListener
) return NS_ERROR_FAILURE
;
214 // Allocate the sniffer buffer...
215 if (NS_SUCCEEDED(rv
) && !mBuffer
) {
216 mBuffer
= new char[MAX_BUFFER_SIZE
];
219 rv
= NS_ERROR_OUT_OF_MEMORY
;
223 // Do not pass the OnStartRequest on to the next listener (yet)...
228 nsUnknownDecoder::OnStopRequest(nsIRequest
* request
, nsISupports
*aCtxt
,
233 if (!mNextListener
) return NS_ERROR_FAILURE
;
236 // The total amount of data is less than the size of the sniffer buffer.
237 // Analyze the buffer now...
239 if (mContentType
.IsEmpty()) {
240 DetermineContentType(request
);
242 rv
= FireListenerNotifications(request
, aCtxt
);
249 rv
= mNextListener
->OnStopRequest(request
, aCtxt
, aStatus
);
257 // nsIContentSniffer methods...
261 nsUnknownDecoder::GetMIMETypeFromContent(nsIRequest
* aRequest
,
262 const PRUint8
* aData
,
266 mBuffer
= const_cast<char*>(reinterpret_cast<const char*>(aData
));
267 mBufferLen
= aLength
;
268 DetermineContentType(aRequest
);
271 type
.Assign(mContentType
);
272 mContentType
.Truncate();
273 return type
.IsEmpty() ? NS_ERROR_NOT_AVAILABLE
: NS_OK
;
277 // Actual sniffing code
279 PRBool
nsUnknownDecoder::AllowSniffing(nsIRequest
* aRequest
)
281 if (!mRequireHTMLsuffix
) {
285 nsCOMPtr
<nsIChannel
> channel
= do_QueryInterface(aRequest
);
287 NS_ERROR("QI failed");
291 nsCOMPtr
<nsIURI
> uri
;
292 if (NS_FAILED(channel
->GetURI(getter_AddRefs(uri
))) || !uri
) {
296 PRBool isLocalFile
= PR_FALSE
;
297 if (NS_FAILED(uri
->SchemeIs("file", &isLocalFile
)) || isLocalFile
) {
/**
 * This is the array of sniffer entries that depend on "magic numbers"
 * in the file.  Each entry has either a type associated with it (set
 * these with the SNIFFER_ENTRY macro) or a function to be executed
 * (set these with the SNIFFER_ENTRY_WITH_FUNC macro).  The function
 * takes a single nsIRequest* and returns PRBool -- PR_TRUE if
 * it sets mContentType, PR_FALSE otherwise.
 */
312 nsUnknownDecoder::nsSnifferEntry
nsUnknownDecoder::sSnifferEntries
[] = {
313 SNIFFER_ENTRY("%PDF-", APPLICATION_PDF
),
315 SNIFFER_ENTRY("%!PS-Adobe-", APPLICATION_POSTSCRIPT
),
316 SNIFFER_ENTRY("%! PS-Adobe-", APPLICATION_POSTSCRIPT
),
// Provisionally treat files that start with mailbox delimiters as TEXT_PLAIN.
320 SNIFFER_ENTRY("From", TEXT_PLAIN
),
321 SNIFFER_ENTRY(">From", TEXT_PLAIN
),
323 // If the buffer begins with "#!" or "%!" then it is a script of
324 // some sort... "Scripts" can include arbitrary data to be passed
325 // to an interpreter, so we need to decide whether we can call this
326 // text or whether it's data.
327 SNIFFER_ENTRY_WITH_FUNC("#!", &nsUnknownDecoder::LastDitchSniff
),
329 // XXXbz should (and can) we also include the various ways that <?xml can
330 // appear as UTF-16 and such? See http://www.w3.org/TR/REC-xml#sec-guessing
331 SNIFFER_ENTRY_WITH_FUNC("<?xml", &nsUnknownDecoder::SniffForXML
)
334 PRUint32
nsUnknownDecoder::sSnifferEntryNum
=
335 sizeof(nsUnknownDecoder::sSnifferEntries
) /
336 sizeof(nsUnknownDecoder::nsSnifferEntry
);
338 void nsUnknownDecoder::DetermineContentType(nsIRequest
* aRequest
)
340 NS_ASSERTION(mContentType
.IsEmpty(), "Content type is already known.");
341 if (!mContentType
.IsEmpty()) return;
// First, run through all the types we can detect reliably based on the
// leading bytes of the buffer (the sSnifferEntries table).
346 for (i
= 0; i
< sSnifferEntryNum
; ++i
) {
347 if (mBufferLen
>= sSnifferEntries
[i
].mByteLen
&& // enough data
348 memcmp(mBuffer
, sSnifferEntries
[i
].mBytes
, sSnifferEntries
[i
].mByteLen
) == 0) { // and type matches
349 NS_ASSERTION(sSnifferEntries
[i
].mMimeType
||
350 sSnifferEntries
[i
].mContentTypeSniffer
,
351 "Must have either a type string or a function to set the type");
352 NS_ASSERTION(sSnifferEntries
[i
].mMimeType
== nsnull
||
353 sSnifferEntries
[i
].mContentTypeSniffer
== nsnull
,
354 "Both a type string and a type sniffing function set;"
355 " using type string");
356 if (sSnifferEntries
[i
].mMimeType
) {
357 mContentType
= sSnifferEntries
[i
].mMimeType
;
358 NS_ASSERTION(!mContentType
.IsEmpty(),
359 "Content type should be known by now.");
362 if ((this->*(sSnifferEntries
[i
].mContentTypeSniffer
))(aRequest
)) {
363 NS_ASSERTION(!mContentType
.IsEmpty(),
364 "Content type should be known by now.");
370 if (TryContentSniffers(aRequest
)) {
371 NS_ASSERTION(!mContentType
.IsEmpty(),
372 "Content type should be known by now.");
376 if (SniffForHTML(aRequest
)) {
377 NS_ASSERTION(!mContentType
.IsEmpty(),
378 "Content type should be known by now.");
382 // We don't know what this is yet. Before we just give up, try
383 // the URI from the request.
384 if (SniffURI(aRequest
)) {
385 NS_ASSERTION(!mContentType
.IsEmpty(),
386 "Content type should be known by now.");
390 LastDitchSniff(aRequest
);
391 NS_ASSERTION(!mContentType
.IsEmpty(),
392 "Content type should be known by now.");
395 PRBool
nsUnknownDecoder::TryContentSniffers(nsIRequest
* aRequest
)
397 // Enumerate content sniffers
398 nsCOMPtr
<nsICategoryManager
> catMan(do_GetService("@mozilla.org/categorymanager;1"));
403 nsCOMPtr
<nsISimpleEnumerator
> sniffers
;
404 catMan
->EnumerateCategory("content-sniffing-services", getter_AddRefs(sniffers
));
410 while (NS_SUCCEEDED(sniffers
->HasMoreElements(&hasMore
)) && hasMore
) {
411 nsCOMPtr
<nsISupports
> elem
;
412 sniffers
->GetNext(getter_AddRefs(elem
));
413 NS_ASSERTION(elem
, "No element even though hasMore returned true!?");
415 nsCOMPtr
<nsISupportsCString
> sniffer_id(do_QueryInterface(elem
));
416 NS_ASSERTION(sniffer_id
, "element is no nsISupportsCString!?");
417 nsCAutoString contractid
;
418 nsresult rv
= sniffer_id
->GetData(contractid
);
423 nsCOMPtr
<nsIContentSniffer
> sniffer(do_GetService(contractid
.get()));
428 rv
= sniffer
->GetMIMETypeFromContent(aRequest
, (const PRUint8
*)mBuffer
,
429 mBufferLen
, mContentType
);
430 if (NS_SUCCEEDED(rv
)) {
438 PRBool
nsUnknownDecoder::SniffForHTML(nsIRequest
* aRequest
)
441 * To prevent a possible attack, we will not consider this to be
442 * html content if it comes from the local file system and our prefs
445 if (!AllowSniffing(aRequest
)) {
449 // Now look for HTML.
450 const char* str
= mBuffer
;
451 const char* end
= mBuffer
+ mBufferLen
;
453 // skip leading whitespace
454 while (str
!= end
&& nsCRT::IsAsciiSpace(*str
)) {
458 // did we find something like a start tag?
459 if (str
== end
|| *str
!= '<' || ++str
== end
) {
463 // If we seem to be SGML or XML and we got down here, just pretend we're HTML
464 if (*str
== '!' || *str
== '?') {
465 mContentType
= TEXT_HTML
;
469 PRUint32 bufSize
= end
- str
;
// MATCHES_TAG(_tagstr): true when the text at |str| starts with the string
// literal _tagstr immediately followed by a space or by '>'.  The comparison
// goes through PL_strncasecmp.
//
// We use sizeof(_tagstr) below because that counts the literal's terminating
// NUL, i.e. it is the length of _tagstr with the one char " " or ">"
// appended.  The bufSize check keeps the comparison inside the data that is
// actually available.
//
// Expects |str| (current position) and |bufSize| (bytes remaining from
// |str|) to be in scope at the point of use.
#define MATCHES_TAG(_tagstr)                                              \
  (bufSize >= sizeof(_tagstr) &&                                          \
   (PL_strncasecmp(str, _tagstr " ", sizeof(_tagstr)) == 0 ||             \
    PL_strncasecmp(str, _tagstr ">", sizeof(_tagstr)) == 0))
477 if (MATCHES_TAG("html") ||
478 MATCHES_TAG("frameset") ||
479 MATCHES_TAG("body") ||
480 MATCHES_TAG("head") ||
481 MATCHES_TAG("script") ||
482 MATCHES_TAG("iframe") ||
484 MATCHES_TAG("img") ||
485 MATCHES_TAG("table") ||
486 MATCHES_TAG("title") ||
487 MATCHES_TAG("link") ||
488 MATCHES_TAG("base") ||
489 MATCHES_TAG("style") ||
490 MATCHES_TAG("div") ||
492 MATCHES_TAG("font") ||
493 MATCHES_TAG("applet") ||
494 MATCHES_TAG("meta") ||
495 MATCHES_TAG("center") ||
496 MATCHES_TAG("form") ||
497 MATCHES_TAG("isindex") ||
505 MATCHES_TAG("pre")) {
507 mContentType
= TEXT_HTML
;
516 PRBool
nsUnknownDecoder::SniffForXML(nsIRequest
* aRequest
)
518 // Just like HTML, this should be able to be shut off.
519 if (!AllowSniffing(aRequest
)) {
523 // First see whether we can glean anything from the uri...
524 if (!SniffURI(aRequest
)) {
525 // Oh well; just generic XML will have to do
526 mContentType
= TEXT_XML
;
532 PRBool
nsUnknownDecoder::SniffURI(nsIRequest
* aRequest
)
534 nsCOMPtr
<nsIMIMEService
> mimeService(do_GetService("@mozilla.org/mime;1"));
536 nsCOMPtr
<nsIChannel
> channel
= do_QueryInterface(aRequest
);
538 nsCOMPtr
<nsIURI
> uri
;
539 nsresult result
= channel
->GetURI(getter_AddRefs(uri
));
540 if (NS_SUCCEEDED(result
) && uri
) {
542 result
= mimeService
->GetTypeFromURI(uri
, type
);
543 if (NS_SUCCEEDED(result
)) {
// This macro is based on RFC 2046 Section 4.1.2.  A byte counts as text when
// it is greater than 31, lies in the 9-13 range (\t, \n, \v, \f, \r), or is
// 27 (ESC, which starts the escape sequences of encodings such as
// ISO-2022-JP).  Every other byte in the 0-31 range is treated as non-text.
//
// The first test casts to unsigned char so that bytes >= 0x80 held in a
// (possibly signed) char still compare as greater than 31.
// Note: the argument is evaluated more than once, so it must not have side
// effects.
#define IS_TEXT_CHAR(ch)                                                  \
  (((unsigned char)(ch)) > 31 || (9 <= (ch) && (ch) <= 13) || (ch) == 27)
560 PRBool
nsUnknownDecoder::LastDitchSniff(nsIRequest
* aRequest
)
562 // All we can do now is try to guess whether this is text/plain or
563 // application/octet-stream
565 // First, check for a BOM. If we see one, assume this is text/plain
566 // in whatever encoding. If there is a BOM _and_ text we will
567 // always have at least 4 bytes in the buffer (since the 2-byte BOMs
568 // are for 2-byte encodings and the UTF-8 BOM is 3 bytes).
569 if (mBufferLen
>= 4) {
570 const unsigned char* buf
= (const unsigned char*)mBuffer
;
571 if ((buf
[0] == 0xFE && buf
[1] == 0xFF) || // UTF-16, Big Endian
572 (buf
[0] == 0xFF && buf
[1] == 0xFE) || // UTF-16 or UCS-4, Little Endian
573 (buf
[0] == 0xEF && buf
[1] == 0xBB && buf
[2] == 0xBF) || // UTF-8
574 (buf
[0] == 0 && buf
[1] == 0 && buf
[2] == 0xFE && buf
[3] == 0xFF)) { // UCS-4, Big Endian
576 mContentType
= TEXT_PLAIN
;
581 // Now see whether the buffer has any non-text chars. If not, then let's
582 // just call it text/plain...
585 for (i
=0; i
<mBufferLen
&& IS_TEXT_CHAR(mBuffer
[i
]); i
++);
587 if (i
== mBufferLen
) {
588 mContentType
= TEXT_PLAIN
;
591 mContentType
= APPLICATION_OCTET_STREAM
;
598 nsresult
nsUnknownDecoder::FireListenerNotifications(nsIRequest
* request
,
603 if (!mNextListener
) return NS_ERROR_FAILURE
;
605 if (!mContentType
.IsEmpty()) {
606 nsCOMPtr
<nsIViewSourceChannel
> viewSourceChannel
=
607 do_QueryInterface(request
);
608 if (viewSourceChannel
) {
609 rv
= viewSourceChannel
->SetOriginalContentType(mContentType
);
611 nsCOMPtr
<nsIChannel
> channel
= do_QueryInterface(request
, &rv
);
612 if (NS_SUCCEEDED(rv
)) {
613 // Set the new content type on the channel...
614 rv
= channel
->SetContentType(mContentType
);
618 NS_ASSERTION(NS_SUCCEEDED(rv
), "Unable to set content type on channel!");
621 // Cancel the request to make sure it has the correct status if
622 // mNextListener looks at it.
624 mNextListener
->OnStartRequest(request
, aCtxt
);
629 // Fire the OnStartRequest(...)
630 rv
= mNextListener
->OnStartRequest(request
, aCtxt
);
632 if (!mBuffer
) return NS_ERROR_OUT_OF_MEMORY
;
634 // If the request was canceled, then we need to treat that equivalently
635 // to an error returned by OnStartRequest.
636 if (NS_SUCCEEDED(rv
))
637 request
->GetStatus(&rv
);
639 // Fire the first OnDataAvailable for the data that was read from the
640 // stream into the sniffer buffer...
641 if (NS_SUCCEEDED(rv
) && (mBufferLen
> 0)) {
643 nsCOMPtr
<nsIInputStream
> in
;
644 nsCOMPtr
<nsIOutputStream
> out
;
646 // Create a pipe and fill it with the data from the sniffer buffer.
647 rv
= NS_NewPipe(getter_AddRefs(in
), getter_AddRefs(out
),
648 MAX_BUFFER_SIZE
, MAX_BUFFER_SIZE
);
650 if (NS_SUCCEEDED(rv
)) {
651 rv
= out
->Write(mBuffer
, mBufferLen
, &len
);
652 if (NS_SUCCEEDED(rv
)) {
653 if (len
== mBufferLen
) {
654 rv
= mNextListener
->OnDataAvailable(request
, aCtxt
, in
, 0, len
);
656 NS_ERROR("Unable to write all the data into the pipe.");
657 rv
= NS_ERROR_FAILURE
;
671 nsBinaryDetector::DetermineContentType(nsIRequest
* aRequest
)
673 nsCOMPtr
<nsIHttpChannel
> httpChannel
= do_QueryInterface(aRequest
);
678 // It's an HTTP channel. Check for the text/plain mess
679 nsCAutoString contentTypeHdr
;
680 httpChannel
->GetResponseHeader(NS_LITERAL_CSTRING("Content-Type"),
682 nsCAutoString contentType
;
683 httpChannel
->GetContentType(contentType
);
685 // Make sure to do a case-sensitive exact match comparison here. Apache
686 // 1.x just sends text/plain for "unknown", while Apache 2.x sends
687 // text/plain with a ISO-8859-1 charset. Debian's Apache version, just to
688 // be different, sends text/plain with iso-8859-1 charset. For extra fun,
689 // FC7, RHEL4, and Ubuntu Feisty send charset=UTF-8. Don't do general
690 // case-insensitive comparison, since we really want to apply this crap as
692 if (!contentType
.EqualsLiteral("text/plain") ||
693 (!contentTypeHdr
.EqualsLiteral("text/plain") &&
694 !contentTypeHdr
.EqualsLiteral("text/plain; charset=ISO-8859-1") &&
695 !contentTypeHdr
.EqualsLiteral("text/plain; charset=iso-8859-1") &&
696 !contentTypeHdr
.EqualsLiteral("text/plain; charset=UTF-8"))) {
700 // Check whether we have content-encoding. If we do, don't try to
702 // XXXbz we could improve this by doing a local decompress if we
704 nsCAutoString contentEncoding
;
705 httpChannel
->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
707 if (!contentEncoding
.IsEmpty()) {
711 LastDitchSniff(aRequest
);
712 if (mContentType
.Equals(APPLICATION_OCTET_STREAM
)) {
713 // We want to guess at it instead
714 mContentType
= APPLICATION_GUESS_FROM_EXT
;
716 // Let the text/plain type we already have be, so that other content
717 // sniffers can also get a shot at this data.
718 mContentType
.Truncate();
723 nsBinaryDetector::Register(nsIComponentManager
* compMgr
, nsIFile
* path
,
724 const char* registryLocation
,
725 const char* componentType
,
726 const nsModuleComponentInfo
*info
)
729 nsCOMPtr
<nsICategoryManager
> catman
=
730 do_GetService(NS_CATEGORYMANAGER_CONTRACTID
, &rv
);
734 return catman
->AddCategoryEntry(NS_CONTENT_SNIFFER_CATEGORY
,
736 NS_BINARYDETECTOR_CONTRACTID
,
737 PR_TRUE
, PR_TRUE
, nsnull
);