1 /****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one *
3 * or more contributor license agreements. See the NOTICE file *
4 * distributed with this work for additional information *
5 * regarding copyright ownership. The ASF licenses this file *
6 * to you under the Apache License, Version 2.0 (the *
7 * "License"); you may not use this file except in compliance *
8 * with the License. You may obtain a copy of the License at *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
12 * Unless required by applicable law or agreed to in writing, *
13 * software distributed under the License is distributed on an *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
15 * KIND, either express or implied. See the License for the *
16 * specific language governing permissions and limitations *
17 * under the License. *
18 ****************************************************************/
20 package org
.apache
.james
.mime4j
;
22 import java
.io
.IOException
;
23 import java
.io
.InputStream
;
24 import java
.io
.InputStreamReader
;
25 import java
.io
.Reader
;
26 import java
.nio
.charset
.Charset
;
27 import java
.nio
.charset
.IllegalCharsetNameException
;
28 import java
.nio
.charset
.UnsupportedCharsetException
;
29 import java
.util
.LinkedList
;
31 import org
.apache
.james
.mime4j
.decoder
.Base64InputStream
;
32 import org
.apache
.james
.mime4j
.decoder
.QuotedPrintableInputStream
;
33 import org
.apache
.james
.mime4j
.util
.MimeUtil
;
37 * Parses MIME (or RFC822) message streams of bytes or characters.
38 * The stream is converted into an event stream.
44 * MimeTokenStream stream = new MimeTokenStream();
45 * stream.parse(new FileInputStream("mime.msg"));
46 * for (int state = stream.getState();
47 * state != MimeTokenStream.T_END_OF_STREAM;
48 * state = stream.next()) {
50 * case MimeTokenStream.T_BODY:
51 * System.out.println("Body detected, contents = "
52 * + stream.getInputStream() + ", header data = "
53 * + stream.getBodyDescriptor());
55 * case MimeTokenStream.T_FIELD:
56 * System.out.println("Header field detected: "
57 * + stream.getField());
59 * case MimeTokenStream.T_START_MULTIPART:
60 * System.out.println("Multipart message detected,"
62 * + stream.getBodyDescriptor());
68 * <strong>NOTE:</strong> All lines must end with CRLF
69 * (<code>\r\n</code>). If you are unsure of the line endings in your stream
70 * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream}
72 * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the
73 * method {@link #parse(InputStream)} resets the token stream's internal
74 * state. However, they are definitely <em>not</em> thread safe. If you
75 * have a multi-threaded application, then the suggested use is to have
76 * one instance per thread.</p>
78 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $
80 public class MimeTokenStream
implements EntityStates
, RecursionMode
{
83 * Creates a stream that creates a more detailed body descriptor.
84 * @return <code>MimeTokenStream</code>, not null
86 public static final MimeTokenStream
createMaximalDescriptorStream() {
87 return new MimeTokenStream(false, true);
91 * Creates a stream that strictly validates the input.
92 * @return <code>MimeTokenStream</code> which throws a
93 * <code>MimeException</code> whenever possible issues
94 * are detected in the input
96 public static final MimeTokenStream
createStrictValidationStream() {
97 return new MimeTokenStream(true, false);
100 private final boolean strictParsing
;
101 private final boolean maximalBodyDescriptor
;
102 private final LinkedList entities
= new LinkedList();
104 private int state
= T_END_OF_STREAM
;
105 private EntityStateMachine currentStateMachine
;
106 private int recursionMode
= M_RECURSE
;
107 private InputBuffer inbuffer
;
108 private RootInputStream rootInputStream
;
111 * Constructs a standard (lax) stream.
112 * Optional validation events will be logged only.
113 * Use {@link #createStrictValidationStream()} to create
114 * a stream that strictly validates the input.
116 public MimeTokenStream() {
120 protected MimeTokenStream(final boolean strictParsing
, final boolean maximalBodyDescriptor
) {
121 this.strictParsing
= strictParsing
;
122 this.maximalBodyDescriptor
= maximalBodyDescriptor
;
125 /** Instructs the {@code MimeTokenStream} to parse the given stream's contents.
126 * If the {@code MimeTokenStream} has already been in use, resets the stream's
129 public void parse(InputStream stream
) {
130 doParse(stream
, null);
133 /** Instructs the {@code MimeTokenStream} to parse the given content with
134 * the content type. The message stream is assumed to have no message header
135 * and is expected to begin with a message body. This can be the case when
136 * the message content is transmitted using a different transport protocol
139 * If the {@code MimeTokenStream} has already been in use, resets the stream's
142 public void parseHeadless(InputStream stream
, String contentType
) {
143 if (contentType
== null) {
144 throw new IllegalArgumentException("Content type may not be null");
146 doParse(stream
, contentType
);
149 private void doParse(InputStream stream
, String contentType
) {
151 rootInputStream
= new RootInputStream(stream
);
152 inbuffer
= new InputBuffer(rootInputStream
, 4 * 1024);
153 switch (recursionMode
) {
155 RawEntity rawentity
= new RawEntity(new BasicBufferingInputStream(inbuffer
));
156 currentStateMachine
= rawentity
;
160 // expected to be called only at start of parsing
162 MimeEntity mimeentity
= new MimeEntity(
164 new BasicBufferingInputStream(inbuffer
),
169 maximalBodyDescriptor
,
171 mimeentity
.setRecursionMode(recursionMode
);
172 if (contentType
!= null) {
173 mimeentity
.skipHeader(contentType
);
175 currentStateMachine
= mimeentity
;
178 entities
.add(currentStateMachine
);
179 state
= currentStateMachine
.getState();
183 * Determines if this parser is currently in raw mode.
185 * @return <code>true</code> if in raw mode, <code>false</code>
187 * @see #setRecursionMode(int)
189 public boolean isRaw() {
190 return recursionMode
== M_RAW
;
194 * Gets the current recursion mode.
195 * The recursion mode specifies the approach taken to parsing parts.
196 * {@link #M_RAW} mode does not parse the part at all.
197 * {@link #M_RECURSE} mode recursively parses each mail
198 * when a <code>message/rfc822</code> part is encountered;
199 * {@link #M_NO_RECURSE} does not.
200 * @return {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE}
202 public int getRecursionMode() {
203 return recursionMode
;
207 * Sets the current recursion mode.
208 * The recursion mode specifies the approach taken to parsing parts.
209 * {@link #M_RAW} mode does not parse the part at all.
210 * {@link #M_RECURSE} mode recursively parses each mail
211 * when a <code>message/rfc822</code> part is encountered;
212 * {@link #M_NO_RECURSE} does not.
213 * @param mode {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE}
215 public void setRecursionMode(int mode
) {
216 recursionMode
= mode
;
217 if (currentStateMachine
!= null) {
218 currentStateMachine
.setRecursionMode(mode
);
223 * Finishes the parsing and stops reading lines.
224 * NOTE: No more lines will be parsed but the parser
226 * {@link ContentHandler#endMultipart()},
227 * {@link ContentHandler#endBodyPart()},
228 * {@link ContentHandler#endMessage()}, etc to match previous calls
230 * {@link ContentHandler#startMultipart(BodyDescriptor)},
231 * {@link ContentHandler#startBodyPart()},
232 * {@link ContentHandler#startMessage()}, etc.
236 rootInputStream
.truncate();
240 * Returns the current state.
242 public int getState() {
247 * This method is valid, if {@link #getState()} returns either of
248 * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}.
249 * It returns the raw entity, preamble, or epilogue contents.
250 * @return Data stream, depending on the current state.
251 * @throws IllegalStateException {@link #getState()} returns an
254 public InputStream
getInputStream() {
255 return currentStateMachine
.getContentStream();
259 * Gets a reader configured for the current body or body part.
260 * The reader will return a transfer and charset decoded
261 * stream of characters based on the MIME fields with the standard
263 * This is a convenience method and relies on {@link #getInputStream()}.
264 * Consult the javadoc for that method for known limitations.
266 * @return <code>Reader</code>, not null
267 * @see #getInputStream
268 * @throws IllegalStateException {@link #getState()} returns an
270 * @throws UnsupportedCharsetException if there is no JVM support
271 * for decoding the charset
272 * @throws IllegalCharsetNameException if the charset name specified
273 * in the mime type is illegal
275 public Reader
getReader() {
276 final BodyDescriptor bodyDescriptor
= getBodyDescriptor();
277 final String mimeCharset
= bodyDescriptor
.getCharset();
278 final String transferEncoding
= bodyDescriptor
.getTransferEncoding();
279 final Charset charset
;
280 if (mimeCharset
== null || "".equals(mimeCharset
)) {
281 charset
= Charset
.forName("US-ASCII");
283 charset
= Charset
.forName(mimeCharset
);
286 final InputStream inputStream
;
287 final InputStream transferEncodedStream
= getInputStream();
288 if (MimeUtil
.isBase64Encoding(transferEncoding
)) {
289 inputStream
= new Base64InputStream(transferEncodedStream
);
290 } else if (MimeUtil
.isQuotedPrintableEncoded(transferEncoding
)) {
291 inputStream
= new QuotedPrintableInputStream(transferEncodedStream
);
293 inputStream
= transferEncodedStream
;
295 final InputStreamReader result
= new InputStreamReader(inputStream
, charset
);
300 * <p>Gets a descriptor for the current entity.
301 * This method is valid if {@link #getState()} returns:</p>
303 * <li>{@link #T_BODY}</li>
304 * <li>{@link #T_START_MULTIPART}</li>
305 * <li>{@link #T_EPILOGUE}</li>
306 * <li>{@link #T_PREAMBLE}</li>
308 * @return <code>BodyDescriptor</code>, not null
310 public BodyDescriptor
getBodyDescriptor() {
311 return currentStateMachine
.getBodyDescriptor();
315 * This method is valid, if {@link #getState()} returns {@link #T_FIELD}.
316 * @return String with the field's raw contents.
317 * @throws IllegalStateException {@link #getState()} returns another
318 * value than {@link #T_FIELD}.
320 public String
getField() {
321 return currentStateMachine
.getField();
325 * This method is valid, if {@link #getState()} returns {@link #T_FIELD}.
326 * @return String with the field's name.
327 * @throws IllegalStateException {@link #getState()} returns another
328 * value than {@link #T_FIELD}.
330 public String
getFieldName() {
331 return currentStateMachine
.getFieldName();
335 * This method is valid, if {@link #getState()} returns {@link #T_FIELD}.
336 * @return String with the field's value.
337 * @throws IllegalStateException {@link #getState()} returns another
338 * value than {@link #T_FIELD}.
340 public String
getFieldValue() {
341 return currentStateMachine
.getFieldValue();
345 * This method advances the token stream to the next token.
346 * @throws IllegalStateException The method has been called, although
347 * {@link #getState()} was already {@link #T_END_OF_STREAM}.
349 public int next() throws IOException
, MimeException
{
350 if (state
== T_END_OF_STREAM
|| currentStateMachine
== null) {
351 throw new IllegalStateException("No more tokens are available.");
353 while (currentStateMachine
!= null) {
354 EntityStateMachine next
= currentStateMachine
.advance();
357 currentStateMachine
= next
;
359 state
= currentStateMachine
.getState();
360 if (state
!= T_END_OF_STREAM
) {
363 entities
.removeLast();
364 if (entities
.isEmpty()) {
365 currentStateMachine
= null;
367 currentStateMachine
= (EntityStateMachine
) entities
.getLast();
368 currentStateMachine
.setRecursionMode(recursionMode
);
371 state
= T_END_OF_STREAM
;
376 * Renders a state as a string suitable for logging.
378 * @return rendered as string, not null
380 public static final String
stateToString(int state
) {
381 return AbstractEntity
.stateToString(state
);