MIME4J-5 Performance patch 3, https://issues.apache.org/jira/browse/MIME4J-5. Contrib...
[mime4j.git] / src / main / java / org / apache / james / mime4j / MimeTokenStream.java
blobf3d9dd0e6e7db34e6a4e01b7ab783078d3d7b968
1 /****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one *
3 * or more contributor license agreements. See the NOTICE file *
4 * distributed with this work for additional information *
5 * regarding copyright ownership. The ASF licenses this file *
6 * to you under the Apache License, Version 2.0 (the *
7 * "License"); you may not use this file except in compliance *
8 * with the License. You may obtain a copy of the License at *
9 * *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, *
13 * software distributed under the License is distributed on an *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
15 * KIND, either express or implied. See the License for the *
16 * specific language governing permissions and limitations *
17 * under the License. *
18 ****************************************************************/
20 package org.apache.james.mime4j;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.nio.charset.Charset;
27 import java.nio.charset.IllegalCharsetNameException;
28 import java.nio.charset.UnsupportedCharsetException;
29 import java.util.LinkedList;
31 import org.apache.james.mime4j.decoder.Base64InputStream;
32 import org.apache.james.mime4j.decoder.QuotedPrintableInputStream;
33 import org.apache.james.mime4j.util.MimeUtil;
/**
 * <p>
 * Parses MIME (or RFC822) message streams of bytes or characters.
 * The stream is converted into an event stream.
 * </p>
 * <p>
 * Typical usage:
 * </p>
 * <pre>
 *      MimeTokenStream stream = new MimeTokenStream();
 *      stream.parse(new FileInputStream("mime.msg"));
 *      for (int state = stream.getState();
 *           state != MimeTokenStream.T_END_OF_STREAM;
 *           state = stream.next()) {
 *          switch (state) {
 *            case MimeTokenStream.T_BODY:
 *              System.out.println("Body detected, contents = "
 *                + stream.getInputStream() + ", header data = "
 *                + stream.getBodyDescriptor());
 *              break;
 *            case MimeTokenStream.T_FIELD:
 *              System.out.println("Header field detected: "
 *                + stream.getField());
 *              break;
 *            case MimeTokenStream.T_START_MULTIPART:
 *              System.out.println("Multipart message detected,"
 *                + " header data = "
 *                + stream.getBodyDescriptor());
 *            ...
 *          }
 *      }
 * </pre>
 * <p>
 * <strong>NOTE:</strong> All lines must end with CRLF
 * (<code>\r\n</code>). If you are unsure of the line endings in your stream
 * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream}
 * instance.</p>
 * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the
 * method {@link #parse(InputStream)} resets the token stream's internal
 * state. However, they are definitely <em>not</em> thread safe. If you
 * have a multi threaded application, then the suggested use is to have
 * one instance per thread.</p>
 *
 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $
 */
80 public class MimeTokenStream implements EntityStates, RecursionMode {
82 /**
83 * Creates a stream that creates a more detailed body descriptor.
84 * @return <code>MimeTokenStream</code>, not null
86 public static final MimeTokenStream createMaximalDescriptorStream() {
87 return new MimeTokenStream(false, true);
90 /**
91 * Creates a stream that strictly validates the input.
92 * @return <code>MimeTokenStream</code> which throws a
93 * <code>MimeException</code> whenever possible issues
94 * are dedicated in the input
96 public static final MimeTokenStream createStrictValidationStream() {
97 return new MimeTokenStream(true, false);
100 private final boolean strictParsing;
101 private final boolean maximalBodyDescriptor;
102 private final LinkedList entities = new LinkedList();
104 private int state = T_END_OF_STREAM;
105 private EntityStateMachine currentStateMachine;
106 private int recursionMode = M_RECURSE;
107 private InputBuffer inbuffer;
108 private RootInputStream rootInputStream;
/**
 * Constructs a standard (lax) stream.
 * Optional validation events will be logged only.
 * Use {@link #createStrictValidationStream()} to create
 * a stream that strictly validates the input.
 */
public MimeTokenStream() {
    this(false, false);
}
/**
 * Constructs a stream with explicit parsing options.
 * @param strictParsing value stored and later passed to the root entity
 * as its strict-parsing flag
 * @param maximalBodyDescriptor value stored and later passed to the root
 * entity as its maximal-body-descriptor flag
 */
protected MimeTokenStream(final boolean strictParsing, final boolean maximalBodyDescriptor) {
    this.strictParsing = strictParsing;
    this.maximalBodyDescriptor = maximalBodyDescriptor;
}
125 /** Instructs the {@code MimeTokenStream} to parse the given streams contents.
126 * If the {@code MimeTokenStream} has already been in use, resets the streams
127 * internal state.
129 public void parse(InputStream stream) {
130 doParse(stream, null);
133 /** Instructs the {@code MimeTokenStream} to parse the given content with
134 * the content type. The message stream is assumed to have no message header
135 * and is expected to begin with a message body. This can be the case when
136 * the message content is transmitted using a different transport protocol
137 * such as HTTP.
138 * <p/>
139 * If the {@code MimeTokenStream} has already been in use, resets the streams
140 * internal state.
142 public void parseHeadless(InputStream stream, String contentType) {
143 if (contentType == null) {
144 throw new IllegalArgumentException("Content type may not be null");
146 doParse(stream, contentType);
149 private void doParse(InputStream stream, String contentType) {
150 entities.clear();
151 rootInputStream = new RootInputStream(stream);
152 inbuffer = new InputBuffer(rootInputStream, 4 * 1024);
153 switch (recursionMode) {
154 case M_RAW:
155 RawEntity rawentity = new RawEntity(new BasicBufferingInputStream(inbuffer));
156 currentStateMachine = rawentity;
157 break;
158 case M_NO_RECURSE:
159 case M_FLAT:
160 // expected to be called only at start of paring
161 case M_RECURSE:
162 MimeEntity mimeentity = new MimeEntity(
163 rootInputStream,
164 new BasicBufferingInputStream(inbuffer),
165 inbuffer,
166 null,
167 T_START_MESSAGE,
168 T_END_MESSAGE,
169 maximalBodyDescriptor,
170 strictParsing);
171 mimeentity.setRecursionMode(recursionMode);
172 if (contentType != null) {
173 mimeentity.skipHeader(contentType);
175 currentStateMachine = mimeentity;
176 break;
178 entities.add(currentStateMachine);
179 state = currentStateMachine.getState();
183 * Determines if this parser is currently in raw mode.
185 * @return <code>true</code> if in raw mode, <code>false</code>
186 * otherwise.
187 * @see #setRaw(boolean)
189 public boolean isRaw() {
190 return recursionMode == M_RAW;
194 * Gets the current recursion mode.
195 * The recursion mode specifies the approach taken to parsing parts.
196 * {@link #M_RAW} mode does not parse the part at all.
197 * {@link #M_RECURSE} mode recursively parses each mail
198 * when an <code>message/rfc822</code> part is encounted;
199 * {@link #M_NO_RECURSE} does not.
200 * @return {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE}
202 public int getRecursionMode() {
203 return recursionMode;
207 * Sets the current recursion.
208 * The recursion mode specifies the approach taken to parsing parts.
209 * {@link #M_RAW} mode does not parse the part at all.
210 * {@link #M_RECURSE} mode recursively parses each mail
211 * when an <code>message/rfc822</code> part is encounted;
212 * {@link #M_NO_RECURSE} does not.
213 * @param mode {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE}
215 public void setRecursionMode(int mode) {
216 recursionMode = mode;
217 if (currentStateMachine != null) {
218 currentStateMachine.setRecursionMode(mode);
223 * Finishes the parsing and stops reading lines.
224 * NOTE: No more lines will be parsed but the parser
225 * will still call
226 * {@link ContentHandler#endMultipart()},
227 * {@link ContentHandler#endBodyPart()},
228 * {@link ContentHandler#endMessage()}, etc to match previous calls
229 * to
230 * {@link ContentHandler#startMultipart(BodyDescriptor)},
231 * {@link ContentHandler#startBodyPart()},
232 * {@link ContentHandler#startMessage()}, etc.
234 public void stop() {
235 inbuffer.clear();
236 rootInputStream.truncate();
240 * Returns the current state.
242 public int getState() {
243 return state;
247 * This method is valid, if {@link #getState()} returns either of
248 * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}.
249 * It returns the raw entity, preamble, or epilogue contents.
250 * @return Data stream, depending on the current state.
251 * @throws IllegalStateException {@link #getState()} returns an
252 * invalid value.
254 public InputStream getInputStream() {
255 return currentStateMachine.getContentStream();
259 * Gets a reader configured for the current body or body part.
260 * The reader will return a transfer and charset decoded
261 * stream of characters based on the MIME fields with the standard
262 * defaults.
263 * This is a conveniance method and relies on {@link #getInputStream()}.
264 * Consult the javadoc for that method for known limitations.
266 * @return <code>Reader</code>, not null
267 * @see #getInputStream
268 * @throws IllegalStateException {@link #getState()} returns an
269 * invalid value
270 * @throws UnsupportedCharsetException if there is no JVM support
271 * for decoding the charset
272 * @throws IllegalCharsetNameException if the charset name specified
273 * in the mime type is illegal
275 public Reader getReader() {
276 final BodyDescriptor bodyDescriptor = getBodyDescriptor();
277 final String mimeCharset = bodyDescriptor.getCharset();
278 final String transferEncoding = bodyDescriptor.getTransferEncoding();
279 final Charset charset;
280 if (mimeCharset == null || "".equals(mimeCharset)) {
281 charset = Charset.forName("US-ASCII");
282 } else {
283 charset = Charset.forName(mimeCharset);
286 final InputStream inputStream;
287 final InputStream transferEncodedStream = getInputStream();
288 if (MimeUtil.isBase64Encoding(transferEncoding)) {
289 inputStream = new Base64InputStream(transferEncodedStream);
290 } else if (MimeUtil.isQuotedPrintableEncoded(transferEncoding)) {
291 inputStream = new QuotedPrintableInputStream(transferEncodedStream);
292 } else {
293 inputStream = transferEncodedStream;
295 final InputStreamReader result = new InputStreamReader(inputStream, charset);
296 return result;
300 * <p>Gets a descriptor for the current entity.
301 * This method is valid if {@link #getState()} returns:</p>
302 * <ul>
303 * <li>{@link #T_BODY}</li>
304 * <li>{@link #T_START_MULTIPART}</li>
305 * <li>{@link #T_EPILOGUE}</li>
306 * <li>{@link #T_PREAMBLE}</li>
307 * </ul>
308 * @return <code>BodyDescriptor</code>, not nulls
310 public BodyDescriptor getBodyDescriptor() {
311 return currentStateMachine.getBodyDescriptor();
315 * This method is valid, if {@link #getState()} returns {@link #T_FIELD}.
316 * @return String with the fields raw contents.
317 * @throws IllegalStateException {@link #getState()} returns another
318 * value than {@link #T_FIELD}.
320 public String getField() {
321 return currentStateMachine.getField();
325 * This method is valid, if {@link #getState()} returns {@link #T_FIELD}.
326 * @return String with the fields name.
327 * @throws IllegalStateException {@link #getState()} returns another
328 * value than {@link #T_FIELD}.
330 public String getFieldName() {
331 return currentStateMachine.getFieldName();
335 * This method is valid, if {@link #getState()} returns {@link #T_FIELD}.
336 * @return String with the fields value.
337 * @throws IllegalStateException {@link #getState()} returns another
338 * value than {@link #T_FIELD}.
340 public String getFieldValue() {
341 return currentStateMachine.getFieldValue();
345 * This method advances the token stream to the next token.
346 * @throws IllegalStateException The method has been called, although
347 * {@link #getState()} was already {@link #T_END_OF_STREAM}.
349 public int next() throws IOException, MimeException {
350 if (state == T_END_OF_STREAM || currentStateMachine == null) {
351 throw new IllegalStateException("No more tokens are available.");
353 while (currentStateMachine != null) {
354 EntityStateMachine next = currentStateMachine.advance();
355 if (next != null) {
356 entities.add(next);
357 currentStateMachine = next;
359 state = currentStateMachine.getState();
360 if (state != T_END_OF_STREAM) {
361 return state;
363 entities.removeLast();
364 if (entities.isEmpty()) {
365 currentStateMachine = null;
366 } else {
367 currentStateMachine = (EntityStateMachine) entities.getLast();
368 currentStateMachine.setRecursionMode(recursionMode);
371 state = T_END_OF_STREAM;
372 return state;
376 * Renders a state as a string suitable for logging.
377 * @param state
378 * @return rendered as string, not null
380 public static final String stateToString(int state) {
381 return AbstractEntity.stateToString(state);