Pull parser patch https://issues.apache.org/jira/browse/MIME4J-19. Contributed by...
[mime4j.git] / src / main / java / org / apache / james / mime4j / MimeTokenStream.java
blob8aa0b9978cfd8040a42b72427af7e3ce4f1451ac
1 /****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one *
3 * or more contributor license agreements. See the NOTICE file *
4 * distributed with this work for additional information *
5 * regarding copyright ownership. The ASF licenses this file *
6 * to you under the Apache License, Version 2.0 (the *
7 * "License"); you may not use this file except in compliance *
8 * with the License. You may obtain a copy of the License at *
9 * *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, *
13 * software distributed under the License is distributed on an *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
15 * KIND, either express or implied. See the License for the *
16 * specific language governing permissions and limitations *
17 * under the License. *
18 ****************************************************************/
20 package org.apache.james.mime4j;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.util.ArrayList;
25 import java.util.BitSet;
26 import java.util.List;
28 import org.apache.commons.logging.Log;
29 import org.apache.commons.logging.LogFactory;
30 import org.apache.james.mime4j.decoder.Base64InputStream;
31 import org.apache.james.mime4j.decoder.QuotedPrintableInputStream;
34 /**
35 * <p>
36 * Parses MIME (or RFC822) message streams of bytes or characters.
37 * The stream is converted into an event stream.
38 * </p>
39 * <p>
40 * Typical usage:
41 * </p>
42 * <pre>
43 * MimeTokenStream stream = new MimeTokenStream();
44 * stream.parse(new BufferedInputStream(new FileInputStream("mime.msg")));
45 * for (int state = stream.getState();
46 * state != MimeTokenStream.T_END_OF_STREAM;
47 * state = stream.next()) {
48 * switch (state) {
49 * case MimeTokenStream.T_BODY:
50 * System.out.println("Body detected, contents = "
51 * + stream.getInputStream() + ", header data = "
52 * + stream.getBodyDescriptor());
53 * break;
54 * case MimeTokenStream.T_FIELD:
55 * System.out.println("Header field detected: "
56 * + stream.getField());
57 * break;
58 * case MimeTokenStream.T_START_MULTIPART:
59 * System.out.println("Multipart message detected,"
60 * + " header data = "
61 * + stream.getBodyDescriptor());
62 * ...
63 * }
64 * }
65 * </pre>
66 * <p>
67 * <strong>NOTE:</strong> All lines must end with CRLF
68 * (<code>\r\n</code>). If you are unsure of the line endings in your stream
69 * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream}
70 * instance.</p>
71 * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the
72 * method {@link #parse(InputStream)} resets the token stream's internal
73 * state. However, they are definitely <em>not</em> thread safe. If you
74 * have a multi-threaded application, then the suggested use is to have
75 * one instance per thread.</p>
77 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $
79 public class MimeTokenStream {
80 private static final Log log = LogFactory.getLog(MimeStreamParser.class);
82 /**
83 * This token indicates, that the MIME stream has been completely
84 * and successfully parsed, and no more data is available.
86 public static final int T_END_OF_STREAM = -1;
87 /**
88 * This token indicates, that the MIME stream is currently
89 * at the beginning of a message.
91 public static final int T_START_MESSAGE = 0;
92 /**
93 * This token indicates, that the MIME stream is currently
94 * at the end of a message.
96 public static final int T_END_MESSAGE = 1;
97 /**
98 * This token indicates, that a raw entity is currently being processed.
99 * You may call {@link #getInputStream()} to obtain the raw entity
100 * data.
102 public static final int T_RAW_ENTITY = 2;
104 * This token indicates, that a message parts headers are now
105 * being parsed.
107 public static final int T_START_HEADER = 3;
109 * This token indicates, that a message parts field has now
110 * been parsed. You may call {@link #getField()} to obtain the
111 * raw field contents.
113 public static final int T_FIELD = 4;
115 * This token indicates, that part headers have now been
116 * parsed.
118 public static final int T_END_HEADER = 5;
120 * This token indicates, that a multipart body is being parsed.
122 public static final int T_START_MULTIPART = 6;
124 * This token indicates, that a multipart body has been parsed.
126 public static final int T_END_MULTIPART = 7;
128 * This token indicates, that a multiparts preamble is being
129 * parsed. You may call {@link #getInputStream()} to access the
130 * preamble contents.
132 public static final int T_PREAMBLE = 8;
134 * This token indicates, that a multiparts epilogue is being
135 * parsed. You may call {@link #getInputStream()} to access the
136 * epilogue contents.
138 public static final int T_EPILOGUE = 9;
140 * This token indicates, that the MIME stream is currently
141 * at the beginning of a body part.
143 public static final int T_START_BODYPART = 10;
145 * This token indicates, that the MIME stream is currently
146 * at the end of a body part.
148 public static final int T_END_BODYPART = 11;
150 * This token indicates, that an atomic entity is being parsed.
151 * Use {@link #getInputStream()} to access the entity contents.
153 public static final int T_BODY = 12;
155 * Internal state, not exposed.
157 private static final int T_IN_BODYPART = -2;
159 * Internal state, not exposed.
161 private static final int T_IN_MESSAGE = -3;
/**
 * Character codes accepted in a header field name: everything from
 * 0x21 through 0x7e except 0x3a (the colon, which separates the field
 * name from its value).
 */
private static final BitSet fieldChars = new BitSet();

static {
    // BitSet.set(from, to) excludes the upper bound, so these two calls
    // cover 0x21..0x39 and 0x3b..0x7e and leave 0x3a cleared.
    fieldChars.set(0x21, 0x3a);
    fieldChars.set(0x3b, 0x7f);
}
173 abstract static class StateMachine {
174 int state;
175 abstract int next() throws IOException, MimeException;
178 private static class RawEntity extends StateMachine {
179 private InputStream stream;
180 RawEntity(InputStream stream) {
181 this.stream = stream;
182 state = T_RAW_ENTITY;
184 int next() {
185 state = T_END_OF_STREAM;
186 return state;
190 private abstract class Entity extends StateMachine {
191 private final BodyDescriptor parent;
192 private final InputStream contents;
193 private final StringBuffer sb = new StringBuffer();
194 private BodyDescriptor body;
195 private int pos, start;
196 private int lineNumber, startLineNumber;
197 private final int endState;
198 private MimeBoundaryInputStream mbis;
199 InputStream stream;
200 String field;
202 Entity(InputStream contents, BodyDescriptor parent, int startState, int endState) {
203 this.parent = parent;
204 this.contents = contents;
205 state = startState;
206 this.endState = endState;
209 private void setParsingFieldState() {
210 state = parseField() ? T_FIELD : T_END_HEADER;
213 private int setParseBodyPartState() throws IOException {
214 mbis.consume();
215 if (mbis.parentEOF()) {
216 if (log.isWarnEnabled()) {
217 log.warn("Line " + rootInputStream.getLineNumber()
218 + ": Body part ended prematurely. "
219 + "Higher level boundary detected or "
220 + "EOF reached.");
222 } else {
223 if (mbis.hasMoreParts()) {
224 mbis = new MimeBoundaryInputStream(contents, body.getBoundary());
225 if (isRaw()) {
226 currentStateMachine = new RawEntity(mbis);
227 } else {
228 currentStateMachine = new BodyPart(mbis, body);
230 entities.add(currentStateMachine);
231 state = T_IN_BODYPART;
232 return currentStateMachine.state;
235 state = T_EPILOGUE;
236 stream = new CloseShieldInputStream(contents);
237 return T_EPILOGUE;
240 int next() throws IOException, MimeException {
241 switch (state) {
242 case T_START_MESSAGE:
243 case T_START_BODYPART:
244 state = T_START_HEADER;
245 break;
246 case T_START_HEADER:
247 initHeaderParsing();
248 setParsingFieldState();
249 break;
250 case T_FIELD:
251 setParsingFieldState();
252 break;
253 case T_END_HEADER:
254 if (body.isMultipart()) {
255 state = T_START_MULTIPART;
256 } else if (body.isMessage()) {
257 InputStream is = contents;
258 if (body.isBase64Encoded()) {
259 log.warn("base64 encoded message/rfc822 detected");
260 is = new EOLConvertingInputStream(new Base64InputStream(contents));
261 } else if (body.isQuotedPrintableEncoded()) {
262 log.warn("quoted-printable encoded message/rfc822 detected");
263 is = new EOLConvertingInputStream(new QuotedPrintableInputStream(contents));
265 state = endState;
266 return parseMessage(is, body);
267 } else {
268 stream = new CloseShieldInputStream(contents);
269 state = T_BODY;
270 break;
272 break;
273 case T_START_MULTIPART:
274 mbis = new MimeBoundaryInputStream(contents, body.getBoundary());
275 stream = new CloseShieldInputStream(mbis);
276 state = T_PREAMBLE;
277 break;
278 case T_PREAMBLE:
279 return setParseBodyPartState();
280 case T_IN_BODYPART:
281 return setParseBodyPartState();
282 case T_EPILOGUE:
283 state = T_END_MULTIPART;
284 break;
285 case T_BODY:
286 case T_END_MULTIPART:
287 case T_IN_MESSAGE:
288 state = endState;
289 break;
290 default:
291 if (state == endState) {
292 state = T_END_OF_STREAM;
293 break;
295 throw new IllegalStateException("Invalid state: " + state);
297 return state;
300 private void initHeaderParsing() throws IOException {
301 body = new BodyDescriptor(parent);
302 startLineNumber = lineNumber = rootInputStream.getLineNumber();
304 int curr = 0;
305 int prev = 0;
306 while ((curr = contents.read()) != -1) {
307 if (curr == '\n' && (prev == '\n' || prev == 0)) {
309 * [\r]\n[\r]\n or an immediate \r\n have been seen.
311 sb.deleteCharAt(sb.length() - 1);
312 break;
314 sb.append((char) curr);
315 prev = curr == '\r' ? prev : curr;
318 if (curr == -1 && log.isWarnEnabled()) {
319 log.warn("Line " + rootInputStream.getLineNumber()
320 + ": Unexpected end of headers detected. "
321 + "Boundary detected in header or EOF reached.");
325 private boolean parseField() {
326 while (pos < sb.length()) {
327 while (pos < sb.length() && sb.charAt(pos) != '\r') {
328 pos++;
330 if (pos < sb.length() - 1 && sb.charAt(pos + 1) != '\n') {
331 pos++;
332 continue;
334 if (pos >= sb.length() - 2 || fieldChars.get(sb.charAt(pos + 2))) {
336 * field should be the complete field data excluding the
337 * trailing \r\n.
339 field = sb.substring(start, pos);
340 start = pos + 2;
343 * Check for a valid field.
345 int index = field.indexOf(':');
346 boolean valid = false;
347 if (index != -1 && fieldChars.get(field.charAt(0))) {
348 valid = true;
349 String fieldName = field.substring(0, index).trim();
350 for (int i = 0; i < fieldName.length(); i++) {
351 if (!fieldChars.get(fieldName.charAt(i))) {
352 valid = false;
353 break;
356 if (valid) {
357 body.addField(fieldName, field.substring(index + 1));
358 startLineNumber = lineNumber;
359 pos += 2;
360 lineNumber++;
361 return true;
364 if (log.isWarnEnabled()) {
365 log.warn("Line " + startLineNumber
366 + ": Ignoring invalid field: '" + field.trim() + "'");
368 startLineNumber = lineNumber;
370 pos += 2;
371 lineNumber++;
373 return false;
377 private class Message extends Entity {
378 Message(InputStream contents, BodyDescriptor parent) {
379 super(contents, parent, T_START_MESSAGE, T_END_MESSAGE);
383 private class BodyPart extends Entity {
384 BodyPart(InputStream contents, BodyDescriptor parent) {
385 super(contents, parent, T_START_BODYPART, T_END_BODYPART);
389 private int state = T_END_OF_STREAM;
390 private RootInputStream rootInputStream;
391 private StateMachine currentStateMachine;
392 private final List entities = new ArrayList();
393 private boolean raw;
395 /** Instructs the {@code MimeTokenStream} to parse the given streams contents.
396 * If the {@code MimeTokenStream} has already been in use, resets the streams
397 * internal state.
399 public void parse(InputStream stream) {
400 entities.clear();
401 rootInputStream = new RootInputStream(stream);
402 state = parseMessage(rootInputStream, null);
405 private int parseMessage(InputStream pStream, BodyDescriptor parent) {
406 if (isRaw()) {
407 currentStateMachine = new RawEntity(pStream);
408 } else {
409 currentStateMachine = new Message(pStream, parent);
411 entities.add(currentStateMachine);
412 return currentStateMachine.state;
416 * Determines if this parser is currently in raw mode.
418 * @return <code>true</code> if in raw mode, <code>false</code>
419 * otherwise.
420 * @see #setRaw(boolean)
422 public boolean isRaw() {
423 return raw;
427 * Enables or disables raw mode. In raw mode all future entities
428 * (messages or body parts) in the stream will be reported to the
429 * {@link ContentHandler#raw(InputStream)} handler method only.
430 * The stream will contain the entire unparsed entity contents
431 * including header fields and whatever is in the body.
433 * @param raw <code>true</code> enables raw mode, <code>false</code>
434 * disables it.
436 public void setRaw(boolean raw) {
437 this.raw = raw;
441 * Finishes the parsing and stops reading lines.
442 * NOTE: No more lines will be parsed but the parser
443 * will still call
444 * {@link ContentHandler#endMultipart()},
445 * {@link ContentHandler#endBodyPart()},
446 * {@link ContentHandler#endMessage()}, etc to match previous calls
447 * to
448 * {@link ContentHandler#startMultipart(BodyDescriptor)},
449 * {@link ContentHandler#startBodyPart()},
450 * {@link ContentHandler#startMessage()}, etc.
452 public void stop() {
453 rootInputStream.truncate();
457 * Returns the current state.
459 public int getState() {
460 return state;
464 * This method is valid, if {@link #getState()} returns {@link #T_FIELD}.
465 * @return String with the fields raw contents.
466 * @throws IllegalStateException {@link #getState()} returns another
467 * value than {@link #T_FIELD}.
469 public String getField() {
470 switch (getState()) {
471 case T_FIELD:
472 return ((Entity) currentStateMachine).field;
473 default:
474 throw new IllegalStateException("Expected state to be T_FIELD.");
479 * This method is valid, if {@link #getState()} returns either of
480 * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}.
481 * It returns the raw entity, preamble, or epilogue contents.
482 * @return Data stream, depending on the current state.
483 * @throws IllegalStateException {@link #getState()} returns an
484 * invalid value.
486 public InputStream getInputStream() {
487 switch (getState()) {
488 case T_RAW_ENTITY:
489 return ((RawEntity) currentStateMachine).stream;
490 case T_PREAMBLE:
491 case T_EPILOGUE:
492 case T_BODY:
493 return ((Entity) currentStateMachine).stream;
494 default:
495 throw new IllegalStateException("Expected state to be either of T_RAW_ENTITY, T_PREAMBLE, or T_EPILOGUE.");
500 * This method is valid, if {@link #getState()} returns
501 * {@link #T_BODY}, or {@link #T_START_MULTIPART}. It returns the current
502 * entities body descriptor.
504 public BodyDescriptor getBodyDescriptor() {
505 switch (getState()) {
506 case T_BODY:
507 case T_START_MULTIPART:
508 return ((Entity) currentStateMachine).body;
509 default:
510 throw new IllegalStateException("Expected state to be T_BODY.");
515 * This method advances the token stream to the next token.
516 * @throws IllegalStateException The method has been called, although
517 * {@link #getState()} was already {@link #T_END_OF_STREAM}.
519 public int next() throws IOException, MimeException {
520 if (state == T_END_OF_STREAM || currentStateMachine == null) {
521 throw new IllegalStateException("No more tokens are available.");
523 while (currentStateMachine != null) {
524 state = currentStateMachine.next();
525 if (state != T_END_OF_STREAM) {
526 return state;
528 entities.remove(entities.size()-1);
529 if (entities.size() == 0) {
530 currentStateMachine = null;
531 } else {
532 currentStateMachine = (StateMachine) entities.get(entities.size()-1);
535 state = T_END_OF_STREAM;
536 return state;