/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org
.apache
.poi
.hwpf
.extractor
;
19 import java
.io
.FileInputStream
;
20 import java
.util
.Iterator
;
22 import org
.apache
.poi
.hwpf
.HWPFDocument
;
23 import org
.apache
.poi
.hwpf
.model
.TextPiece
;
24 import org
.apache
.poi
.hwpf
.usermodel
.Paragraph
;
25 import org
.apache
.poi
.hwpf
.usermodel
.Range
;
27 import junit
.framework
.TestCase
;
30 * Test the different routes to extracting text
32 * @author Nick Burch (nick at torchbox dot com)
34 public class TestDifferentRoutes
extends TestCase
{
35 private String
[] p_text
= new String
[] {
36 "This is a simple word document\r",
38 "It has a number of paragraphs in it\r",
40 "Some of them even feature bold, italic and underlined text\r",
43 "This bit is in a different font and size\r",
46 "This bit features some red text.\r",
49 "It is otherwise very very boring.\r"
52 private HWPFDocument doc
;
54 protected void setUp() throws Exception
{
55 String dirname
= System
.getProperty("HWPF.testdata.path");
57 String filename
= dirname
+ "/test2.doc";
58 doc
= new HWPFDocument(new FileInputStream(filename
));
62 * Test model based extraction
64 public void testExtractFromModel() {
65 Range r
= doc
.getRange();
67 String
[] text
= new String
[r
.numParagraphs()];
68 for(int i
=0; i
< r
.numParagraphs(); i
++) {
69 Paragraph p
= r
.getParagraph(i
);
73 assertEquals(p_text
.length
, text
.length
);
74 for(int i
=0; i
<p_text
.length
; i
++) {
75 assertEquals(p_text
[i
], text
[i
]);
80 * Test textPieces based extraction
82 public void testExtractFromTextPieces() throws Exception
{
83 StringBuffer textBuf
= new StringBuffer();
85 Iterator textPieces
= doc
.getTextTable().getTextPieces().iterator();
86 while (textPieces
.hasNext()) {
87 TextPiece piece
= (TextPiece
) textPieces
.next();
89 String encoding
= "Cp1252";
90 if (piece
.usesUnicode()) {
91 encoding
= "UTF-16LE";
93 String text
= new String(piece
.getRawBytes(), encoding
);
97 StringBuffer exp
= new StringBuffer();
98 for(int i
=0; i
<p_text
.length
; i
++) {
99 exp
.append(p_text
[i
]);
101 assertEquals(exp
.toString(), textBuf
.toString());