1 /****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one *
3 * or more contributor license agreements. See the NOTICE file *
4 * distributed with this work for additional information *
5 * regarding copyright ownership. The ASF licenses this file *
6 * to you under the Apache License, Version 2.0 (the *
7 * "License"); you may not use this file except in compliance *
8 * with the License. You may obtain a copy of the License at *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
12 * Unless required by applicable law or agreed to in writing, *
13 * software distributed under the License is distributed on an *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
15 * KIND, either express or implied. See the License for the *
16 * specific language governing permissions and limitations *
17 * under the License. *
18 ****************************************************************/
20 package org
.apache
.james
.mime4j
;
22 import java
.io
.IOException
;
23 import java
.io
.InputStream
;
24 import java
.util
.ArrayList
;
25 import java
.util
.BitSet
;
26 import java
.util
.List
;
28 import org
.apache
.commons
.logging
.Log
;
29 import org
.apache
.commons
.logging
.LogFactory
;
30 import org
.apache
.james
.mime4j
.decoder
.Base64InputStream
;
31 import org
.apache
.james
.mime4j
.decoder
.QuotedPrintableInputStream
;
36 * Parses MIME (or RFC822) message streams of bytes or characters.
37 * The stream is converted into an event stream.
43 * MimeTokenStream stream = new MimeTokenStream();
44 * stream.parse(new BufferedInputStream(new FileInputStream("mime.msg")));
45 * for (int state = stream.getState();
46 * state != MimeTokenStream.T_END_OF_STREAM;
47 * state = stream.next()) {
49 * case MimeTokenStream.T_BODY:
50 * System.out.println("Body detected, contents = "
51 * + stream.getInputStream() + ", header data = "
52 * + stream.getBodyDescriptor());
54 * case MimeTokenStream.T_FIELD:
55 * System.out.println("Header field detected: "
56 * + stream.getField());
58 * case MimeTokenStream.T_START_MULTIPART:
59 * System.out.println("Multipart message detected,"
61 * + stream.getBodyDescriptor());
67 * <strong>NOTE:</strong> All lines must end with CRLF
68 * (<code>\r\n</code>). If you are unsure of the line endings in your stream
69 * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream}
71 * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the
72 * method {@link #parse(InputStream)} resets the token stream's internal
73 * state. However, they are definitely <em>not</em> thread safe. If you
74 * have a multithreaded application, then the suggested use is to have
75 * one instance per thread.</p>
77 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $
79 public class MimeTokenStream
{
80 private static final Log log
= LogFactory
.getLog(MimeStreamParser
.class);
83 * This token indicates, that the MIME stream has been completely
84 * and successfully parsed, and no more data is available.
86 public static final int T_END_OF_STREAM
= -1;
88 * This token indicates, that the MIME stream is currently
89 * at the beginning of a message.
91 public static final int T_START_MESSAGE
= 0;
93 * This token indicates, that the MIME stream is currently
94 * at the end of a message.
96 public static final int T_END_MESSAGE
= 1;
98 * This token indicates, that a raw entity is currently being processed.
99 * You may call {@link #getInputStream()} to obtain the raw entity
102 public static final int T_RAW_ENTITY
= 2;
104 * This token indicates, that a message part's headers are now
107 public static final int T_START_HEADER
= 3;
109 * This token indicates, that a message part's field has now
110 * been parsed. You may call {@link #getField()} to obtain the
111 * raw field contents.
113 public static final int T_FIELD
= 4;
115 * This token indicates, that part headers have now been
118 public static final int T_END_HEADER
= 5;
120 * This token indicates, that a multipart body is being parsed.
122 public static final int T_START_MULTIPART
= 6;
124 * This token indicates, that a multipart body has been parsed.
126 public static final int T_END_MULTIPART
= 7;
128 * This token indicates, that a multipart's preamble is being
129 * parsed. You may call {@link #getInputStream()} to access the
132 public static final int T_PREAMBLE
= 8;
134 * This token indicates, that a multipart's epilogue is being
135 * parsed. You may call {@link #getInputStream()} to access the
138 public static final int T_EPILOGUE
= 9;
140 * This token indicates, that the MIME stream is currently
141 * at the beginning of a body part.
143 public static final int T_START_BODYPART
= 10;
145 * This token indicates, that the MIME stream is currently
146 * at the end of a body part.
148 public static final int T_END_BODYPART
= 11;
150 * This token indicates, that an atomic entity is being parsed.
151 * Use {@link #getInputStream()} to access the entity contents.
153 public static final int T_BODY
= 12;
155 * Internal state, not exposed.
// Assigned to an entity's own state by setParseBodyPartState() right after a
// child state machine has been pushed onto the entities list.
157 private static final int T_IN_BODYPART
= -2;
159 * Internal state, not exposed.
// NOTE(review): no assignment of T_IN_MESSAGE is visible in this excerpt;
// presumably the message/rfc822 counterpart of T_IN_BODYPART -- confirm.
161 private static final int T_IN_MESSAGE
= -3;
// Character set consulted by parseField(): it is tested against the first
// character of a field, against every character of the field name, and against
// the character that follows a CRLF.
163 private static final BitSet fieldChars
= new BitSet();
// First range: 0x21..0x39, i.e. the code points up to but excluding ':' (0x3a).
// NOTE(review): the loop body is not visible in this excerpt; presumably it
// sets bit i in fieldChars -- confirm.
165 for (int i
= 0x21; i
<= 0x39; i
++) {
// Second range: 0x3b..0x7e, i.e. the code points directly after ':' up to '~'.
// NOTE(review): loop body not visible here either -- confirm it mirrors the first loop.
168 for (int i
= 0x3b; i
<= 0x7e; i
++) {
// Common base of RawEntity and Entity. Instances are what MimeTokenStream keeps
// in its entities list and drives through currentStateMachine.
173 abstract static class StateMachine
{
// Advances this machine by one step. MimeTokenStream.next() stores the returned
// value as the current token and, on T_END_OF_STREAM, removes the machine from
// the entities list.
175 abstract int next() throws IOException
, MimeException
;
// State machine for an entity that is handed out unparsed: it is created in
// state T_RAW_ENTITY around a given stream.
178 private static class RawEntity
extends StateMachine
{
// The wrapped entity data; MimeTokenStream.getInputStream() returns this field.
179 private InputStream stream
;
// Stores the stream and puts the machine into T_RAW_ENTITY straight away.
180 RawEntity(InputStream stream
) {
181 this.stream
= stream
;
182 state
= T_RAW_ENTITY
;
// NOTE(review): the enclosing method header is not visible in this excerpt;
// presumably this is the body of next(), which ends the machine -- confirm.
185 state
= T_END_OF_STREAM
;
// State machine for an entity that is actually parsed; Message and BodyPart
// extend it and only differ in the start/end states they pass in.
190 private abstract class Entity
extends StateMachine
{
// Passed to the BodyDescriptor constructor in initHeaderParsing().
191 private final BodyDescriptor parent
;
// Stream the header bytes are read from and the body streams are built on.
192 private final InputStream contents
;
// Header text collected by initHeaderParsing() and scanned by parseField().
193 private final StringBuffer sb
= new StringBuffer();
// Descriptor created in initHeaderParsing(); parseField() adds fields to it.
194 private BodyDescriptor body
;
// Offsets into sb used by parseField(); a field is sb.substring(start, pos).
195 private int pos
, start
;
// Line numbers seeded from rootInputStream; startLineNumber is what the
// "Ignoring invalid field" warning reports.
196 private int lineNumber
, startLineNumber
;
// State after which next() switches to T_END_OF_STREAM.
197 private final int endState
;
// Boundary stream over contents, created when next() handles T_START_MULTIPART.
198 private MimeBoundaryInputStream mbis
;
// Stores the stream to parse, the parent descriptor and the end state.
// NOTE(review): no use of startState is visible in this excerpt; presumably
// it is assigned to state -- confirm.
202 Entity(InputStream contents
, BodyDescriptor parent
, int startState
, int endState
) {
203 this.parent
= parent
;
204 this.contents
= contents
;
206 this.endState
= endState
;
// Sets the state to T_FIELD when parseField() returns true, and to
// T_END_HEADER when it returns false.
209 private void setParsingFieldState() {
210 state
= parseField() ? T_FIELD
: T_END_HEADER
;
// Decides what follows the current section of a multipart body: either a child
// state machine for the next part is started, or the remaining contents are
// exposed as a stream. Returns the token to report.
// NOTE(review): several lines of this method (braces, else branches, the final
// state assignment and return) are not visible in this excerpt.
213 private int setParseBodyPartState() throws IOException
{
// When mbis reports parentEOF(), a warning about a prematurely ended body part is logged.
215 if (mbis
.parentEOF()) {
216 if (log
.isWarnEnabled()) {
217 log
.warn("Line " + rootInputStream
.getLineNumber()
218 + ": Body part ended prematurely. "
219 + "Higher level boundary detected or "
// Another part follows: open a fresh boundary stream over contents for it.
223 if (mbis
.hasMoreParts()) {
224 mbis
= new MimeBoundaryInputStream(contents
, body
.getBoundary());
// NOTE(review): the condition choosing between RawEntity and BodyPart is not
// visible in this excerpt; presumably the raw-mode flag -- confirm.
226 currentStateMachine
= new RawEntity(mbis
);
228 currentStateMachine
= new BodyPart(mbis
, body
);
// Register the child; MimeTokenStream.next() delegates to currentStateMachine.
230 entities
.add(currentStateMachine
);
// Park this entity in the internal state and report the child's initial state.
231 state
= T_IN_BODYPART
;
232 return currentStateMachine
.state
;
// The remaining contents are exposed through a close-shielded stream.
// NOTE(review): the accompanying state assignment is not visible; presumably
// T_EPILOGUE -- confirm.
236 stream
= new CloseShieldInputStream(contents
);
// Advances this entity by one token and returns the new state.
// NOTE(review): the switch header, several case labels, breaks and closing
// braces are not visible in this excerpt; the comments below only describe the
// statements that are.
240 int next() throws IOException
, MimeException
{
// A fresh message or body part first moves to T_START_HEADER.
242 case T_START_MESSAGE
:
243 case T_START_BODYPART
:
244 state
= T_START_HEADER
;
// Report the next header field, or T_END_HEADER when parseField() finds none.
248 setParsingFieldState();
251 setParsingFieldState();
// Choose the body handling from the descriptor filled while parsing fields.
254 if (body
.isMultipart()) {
255 state
= T_START_MULTIPART
;
256 } else if (body
.isMessage()) {
// Embedded message: wrap contents in a decoder when the descriptor reports a
// base64 or quoted-printable transfer encoding, then parse it as a nested message.
257 InputStream is
= contents
;
258 if (body
.isBase64Encoded()) {
259 log
.warn("base64 encoded message/rfc822 detected");
260 is
= new EOLConvertingInputStream(new Base64InputStream(contents
));
261 } else if (body
.isQuotedPrintableEncoded()) {
262 log
.warn("quoted-printable encoded message/rfc822 detected");
263 is
= new EOLConvertingInputStream(new QuotedPrintableInputStream(contents
));
// parseMessage() registers the nested state machine and returns its initial state.
266 return parseMessage(is
, body
);
// Neither multipart nor message: the body is exposed as a close-shielded stream.
268 stream
= new CloseShieldInputStream(contents
);
// Multipart: put a boundary stream over contents and expose it close-shielded.
273 case T_START_MULTIPART
:
274 mbis
= new MimeBoundaryInputStream(contents
, body
.getBoundary());
275 stream
= new CloseShieldInputStream(mbis
);
// Both visible call sites hand the decision about the next section to
// setParseBodyPartState().
279 return setParseBodyPartState();
281 return setParseBodyPartState();
283 state
= T_END_MULTIPART
;
286 case T_END_MULTIPART
:
// Reaching the end state given to the constructor finishes this machine.
291 if (state
== endState
) {
292 state
= T_END_OF_STREAM
;
295 throw new IllegalStateException("Invalid state: " + state
);
// Creates the body descriptor, records the current line number and copies the
// header bytes from contents into sb until the header-terminating line break.
// NOTE(review): the declarations and initial values of curr and prev, and the
// statement leaving the loop, are not visible in this excerpt.
300 private void initHeaderParsing() throws IOException
{
301 body
= new BodyDescriptor(parent
);
302 startLineNumber
= lineNumber
= rootInputStream
.getLineNumber();
306 while ((curr
= contents
.read()) != -1) {
// A '\n' directly after another '\n' (ignoring '\r'), or while prev is still 0,
// is the terminating condition.
307 if (curr
== '\n' && (prev
== '\n' || prev
== 0)) {
309 * [\r]\n[\r]\n or an immediate \r\n have been seen.
// Drops the last buffered character.
// NOTE(review): if sb is still empty here (input starting with a bare '\n'
// while prev is 0), sb.length() - 1 is -1 and deleteCharAt would throw; no
// guard is visible in this excerpt -- confirm.
311 sb
.deleteCharAt(sb
.length() - 1);
314 sb
.append((char) curr
);
// '\r' leaves prev untouched, so "\r\n\r\n" is seen as two consecutive '\n'.
315 prev
= curr
== '\r' ? prev
: curr
;
// curr is -1 only when the loop ended because read() reported end of data.
318 if (curr
== -1 && log
.isWarnEnabled()) {
319 log
.warn("Line " + rootInputStream
.getLineNumber()
320 + ": Unexpected end of headers detected. "
321 + "Boundary detected in header or EOF reached.");
// Scans sb from pos for the next header field, validates it, and adds a valid
// field to body. The boolean result drives setParsingFieldState().
// NOTE(review): position updates, return statements and closing braces are not
// visible in this excerpt; the comments below only describe what is.
325 private boolean parseField() {
326 while (pos
< sb
.length()) {
// Advance to the next '\r' (or to the end of the buffer).
327 while (pos
< sb
.length() && sb
.charAt(pos
) != '\r') {
// A '\r' that is not followed by '\n' is not treated as a line end.
330 if (pos
< sb
.length() - 1 && sb
.charAt(pos
+ 1) != '\n') {
// The field ends here when the buffer ends after the CRLF, or when the
// character after the CRLF is in fieldChars.
334 if (pos
>= sb
.length() - 2 || fieldChars
.get(sb
.charAt(pos
+ 2))) {
336 * field should be the complete field data excluding the
339 field
= sb
.substring(start
, pos
);
343 * Check for a valid field.
345 int index
= field
.indexOf(':');
346 boolean valid
= false;
// A candidate needs a ':' and a first character from fieldChars; index != -1
// is checked first, so charAt(0) is never evaluated on an empty string.
347 if (index
!= -1 && fieldChars
.get(field
.charAt(0))) {
// Every character of the trimmed name is then checked against fieldChars.
349 String fieldName
= field
.substring(0, index
).trim();
350 for (int i
= 0; i
< fieldName
.length(); i
++) {
351 if (!fieldChars
.get(fieldName
.charAt(i
))) {
// The descriptor receives the name and everything after the ':'.
357 body
.addField(fieldName
, field
.substring(index
+ 1));
358 startLineNumber
= lineNumber
;
// Invalid fields are only logged, using the line number recorded in startLineNumber.
364 if (log
.isWarnEnabled()) {
365 log
.warn("Line " + startLineNumber
366 + ": Ignoring invalid field: '" + field
.trim() + "'");
368 startLineNumber
= lineNumber
;
// Entity variant for a message: passes T_START_MESSAGE as start state and
// T_END_MESSAGE as end state to the Entity constructor.
377 private class Message
extends Entity
{
378 Message(InputStream contents
, BodyDescriptor parent
) {
379 super(contents
, parent
, T_START_MESSAGE
, T_END_MESSAGE
);
// Entity variant for one part of a multipart body: passes T_START_BODYPART as
// start state and T_END_BODYPART as end state to the Entity constructor.
383 private class BodyPart
extends Entity
{
384 BodyPart(InputStream contents
, BodyDescriptor parent
) {
385 super(contents
, parent
, T_START_BODYPART
, T_END_BODYPART
);
// Token most recently reported; T_END_OF_STREAM until parse() assigns a value.
389 private int state
= T_END_OF_STREAM
;
// Wrapper around the stream given to parse(); queried for line numbers in warnings.
390 private RootInputStream rootInputStream
;
// Machine that next() delegates to; set to null once the entities list is empty.
391 private StateMachine currentStateMachine
;
// Active state machines, used as a stack: new machines are appended, and next()
// removes the last element when a machine reports T_END_OF_STREAM.
392 private final List entities
= new ArrayList();
395 /** Instructs the {@code MimeTokenStream} to parse the given stream's contents.
396 * If the {@code MimeTokenStream} has already been in use, resets the stream's
// Starts parsing: wraps the given stream in a RootInputStream and takes the
// initial state from parseMessage() with no parent descriptor.
// NOTE(review): no statement resetting earlier state (e.g. the entities list)
// is visible in this excerpt -- confirm one exists.
399 public void parse(InputStream stream
) {
401 rootInputStream
= new RootInputStream(stream
);
402 state
= parseMessage(rootInputStream
, null);
// Creates the state machine for a (possibly nested) message over pStream, makes
// it current, appends it to entities and returns its initial state.
405 private int parseMessage(InputStream pStream
, BodyDescriptor parent
) {
// NOTE(review): the condition selecting RawEntity over Message is not visible
// in this excerpt; presumably the raw-mode flag -- confirm.
407 currentStateMachine
= new RawEntity(pStream
);
409 currentStateMachine
= new Message(pStream
, parent
);
411 entities
.add(currentStateMachine
);
412 return currentStateMachine
.state
;
416 * Determines if this parser is currently in raw mode.
418 * @return <code>true</code> if in raw mode, <code>false</code>
420 * @see #setRaw(boolean)
422 public boolean isRaw() {
427 * Enables or disables raw mode. In raw mode all future entities
428 * (messages or body parts) in the stream will be reported to the
429 * {@link ContentHandler#raw(InputStream)} handler method only.
430 * The stream will contain the entire unparsed entity contents
431 * including header fields and whatever is in the body.
433 * @param raw <code>true</code> enables raw mode, <code>false</code>
436 public void setRaw(boolean raw
) {
441 * Finishes the parsing and stops reading lines.
442 * NOTE: No more lines will be parsed but the parser
444 * {@link ContentHandler#endMultipart()},
445 * {@link ContentHandler#endBodyPart()},
446 * {@link ContentHandler#endMessage()}, etc to match previous calls
448 * {@link ContentHandler#startMultipart(BodyDescriptor)},
449 * {@link ContentHandler#startBodyPart()},
450 * {@link ContentHandler#startMessage()}, etc.
453 rootInputStream
.truncate();
457 * Returns the current state.
459 public int getState() {
464 * This method is valid, if {@link #getState()} returns {@link #T_FIELD}.
465 * @return String with the field's raw contents.
466 * @throws IllegalStateException {@link #getState()} returns another
467 * value than {@link #T_FIELD}.
469 public String
getField() {
470 switch (getState()) {
472 return ((Entity
) currentStateMachine
).field
;
474 throw new IllegalStateException("Expected state to be T_FIELD.");
479 * This method is valid, if {@link #getState()} returns either of
480 * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}.
481 * It returns the raw entity, preamble, or epilogue contents.
482 * @return Data stream, depending on the current state.
483 * @throws IllegalStateException {@link #getState()} returns an
// Returns the stream belonging to the current token: the stream field of the
// current RawEntity, or the stream field of the current Entity.
// NOTE(review): the case labels are not visible in this excerpt. Entity.next()
// also assigns its stream on the plain-body path and the class-level example
// calls getInputStream() for T_BODY, while the exception message below lists
// only T_RAW_ENTITY, T_PREAMBLE and T_EPILOGUE -- confirm whether T_BODY is
// accepted and whether the message should mention it.
486 public InputStream
getInputStream() {
487 switch (getState()) {
489 return ((RawEntity
) currentStateMachine
).stream
;
493 return ((Entity
) currentStateMachine
).stream
;
495 throw new IllegalStateException("Expected state to be either of T_RAW_ENTITY, T_PREAMBLE, or T_EPILOGUE.");
500 * This method is valid, if {@link #getState()} returns
501 * {@link #T_BODY}, or {@link #T_START_MULTIPART}. It returns the current
502 * entity's body descriptor.
504 public BodyDescriptor
getBodyDescriptor() {
505 switch (getState()) {
507 case T_START_MULTIPART
:
508 return ((Entity
) currentStateMachine
).body
;
510 throw new IllegalStateException("Expected state to be T_BODY.");
515 * This method advances the token stream to the next token.
516 * @throws IllegalStateException The method has been called, although
517 * {@link #getState()} was already {@link #T_END_OF_STREAM}.
519 public int next() throws IOException
, MimeException
{
520 if (state
== T_END_OF_STREAM
|| currentStateMachine
== null) {
521 throw new IllegalStateException("No more tokens are available.");
523 while (currentStateMachine
!= null) {
524 state
= currentStateMachine
.next();
525 if (state
!= T_END_OF_STREAM
) {
528 entities
.remove(entities
.size()-1);
529 if (entities
.size() == 0) {
530 currentStateMachine
= null;
532 currentStateMachine
= (StateMachine
) entities
.get(entities
.size()-1);
535 state
= T_END_OF_STREAM
;