1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 #include <tools/stream.hxx>
23 #include <hintids.hxx>
24 #include <sfx2/docfile.hxx>
25 #include <sfx2/printer.hxx>
26 #include <sfx2/sfxsids.hrc>
27 #include <editeng/fontitem.hxx>
28 #include <editeng/langitem.hxx>
29 #include <editeng/formatbreakitem.hxx>
30 #include <svl/languageoptions.hxx>
31 #include <shellio.hxx>
33 #include <IDocumentContentOperations.hxx>
34 #include <IDocumentDeviceAccess.hxx>
35 #include <IDocumentStylePoolAccess.hxx>
37 #include <breakit.hxx>
39 #include <strings.hrc>
41 #include <poolfmt.hxx>
42 #include <iodetect.hxx>
44 #include <vcl/metric.hxx>
45 #include <osl/diagnose.h>
// Number of bytes requested from the input stream per ReadBytes() call; the
// parser's raw buffer is allocated with this size plus two extra bytes.
47 #define ASC_BUFFLEN 4096
// Insertion position: emplaced from the caller's cursor point in the
// constructor and passed to every insert/split call of the parser.
54 std::optional
<SwPaM
> m_oPam
;
// Raw byte buffer the stream is read into (ASC_BUFFLEN + 2 chars).
56 std::unique_ptr
<char[]> m_pArr
;
// Import options as handed in by the caller (held by reference).
57 const SwAsciiOptions
& m_rOpt
;
// Options that were really applied; starts as a copy of the caller's options
// and is overwritten in ReadChars() with the set chosen there.
58 SwAsciiOptions m_usedAsciiOptions
;
// Font and language items (Latin, CJK and CTL variants) put in the constructor.
59 std::optional
<SfxItemSet
> m_oItemSet
;
// Value of m_rInput.TellEnd(), taken in CallParser().
60 tools::Long m_nFileSize
;
// Union of the script types reported for the text inserted so far.
61 SvtScriptType m_nScript
;
// Inserts rStr at m_oPam and accumulates its script types in m_nScript.
65 void InsertText( const OUString
& rStr
);
// The parser is neither copy-constructible nor copy-assignable.
67 SwASCIIParser(const SwASCIIParser
&) = delete;
68 SwASCIIParser
& operator=(const SwASCIIParser
&) = delete;
// rCursor supplies the insert position, rIn the bytes to import, rOpts the
// requested import options; bReadNewDoc is stored in m_bNewDoc.
71 SwASCIIParser( SwDoc
& rD
, const SwPaM
& rCursor
, SvStream
& rIn
,
72 bool bReadNewDoc
, const SwAsciiOptions
& rOpts
);
// Returns the options recorded in m_usedAsciiOptions.
75 const SwAsciiOptions
& GetUsedAsciiOptions() const { return m_usedAsciiOptions
; }
80 // Call for the general reader interface
// Builds a SwASCIIParser over *m_pStream for rDoc/rPam, runs it, and stores
// the options the parser actually used on the medium's item set. Both
// OUString parameters are unnamed and not used here.
81 ErrCode
AsciiReader::Read( SwDoc
& rDoc
, const OUString
&, SwPaM
&rPam
, const OUString
& )
// Failure path: assert and report a read error.
// NOTE(review): the condition guarding this path is not visible in this
// listing; judging by the message it is presumably a missing m_pStream - confirm.
85 OSL_ENSURE( false, "ASCII read without a stream" );
86 return ERR_SWG_READ_ERROR
;
// The parser's bReadNewDoc argument is the negation of m_bInsertMode.
91 SwASCIIParser
aParser( rDoc
, rPam
, *m_pStream
,
92 !m_bInsertMode
, m_aOption
.GetASCIIOpts() );
93 nRet
= aParser
.CallParser();
// Serialise the options the parser ended up with ...
95 OUString optionsString
;
96 aParser
.GetUsedAsciiOptions().WriteUserData(optionsString
);
// ... and publish them as SID_FILE_FILTEROPTIONS, but only when there is a
// medium that has an item set.
98 if(m_pMedium
!= nullptr && m_pMedium
->GetItemSet() != nullptr)
99 m_pMedium
->GetItemSet()->Put(SfxStringItem(SID_FILE_FILTEROPTIONS
, optionsString
));
101 // after Read reset the options
102 m_aOption
.ResetASCIIOpts();
// Constructor: remembers the options, creates the working PaM and the read
// buffer, and fills m_oItemSet with language and font items derived from the
// options (each stored under its Latin, CJK and CTL which-id).
106 SwASCIIParser::SwASCIIParser(SwDoc
& rD
, const SwPaM
& rCursor
, SvStream
& rIn
, bool bReadNewDoc
,
107 const SwAsciiOptions
& rOpts
)
111 , m_usedAsciiOptions(rOpts
)
113 , m_nScript(SvtScriptType::NONE
)
114 , m_bNewDoc(bReadNewDoc
)
// Work on a private PaM positioned at the cursor's point.
116 m_oPam
.emplace(*rCursor
.GetPoint());
// Read buffer: ASC_BUFFLEN bytes plus two spare bytes.
117 m_pArr
.reset(new char[ASC_BUFFLEN
+ 2]);
// Item ranges: font and language for the Latin, CJK and CTL scripts.
// NOTE(review): the call these arguments belong to is not visible in this
// listing; presumably the construction of m_oItemSet - confirm.
120 m_rDoc
.GetAttrPool(),
121 svl::Items
<RES_CHRATR_FONT
, RES_CHRATR_LANGUAGE
, RES_CHRATR_CJK_FONT
,
122 RES_CHRATR_CJK_LANGUAGE
, RES_CHRATR_CTL_FONT
, RES_CHRATR_CTL_LANGUAGE
>);
124 // set defaults from the options
125 if (m_rOpt
.GetLanguage())
// One language item, put three times under the Latin, CJK and CTL ids.
127 SvxLanguageItem
aLang(m_rOpt
.GetLanguage(), RES_CHRATR_LANGUAGE
);
128 m_oItemSet
->Put(aLang
);
129 aLang
.SetWhich(RES_CHRATR_CJK_LANGUAGE
);
130 m_oItemSet
->Put(aLang
);
131 aLang
.SetWhich(RES_CHRATR_CTL_LANGUAGE
);
132 m_oItemSet
->Put(aLang
);
// NOTE(review): the statement controlled by this check is not visible in this
// listing; presumably an early exit when no font name was given - confirm.
134 if (m_rOpt
.GetFontName().isEmpty())
137 vcl::Font
aTextFont(m_rOpt
.GetFontName(), Size(0, 10));
// When the document has a printer, use the printer's metric for that font.
138 if (m_rDoc
.getIDocumentDeviceAccess().getPrinter(false))
139 aTextFont
= m_rDoc
.getIDocumentDeviceAccess().getPrinter(false)->GetFontMetric(aTextFont
);
// One font item (empty style name), put three times under the Latin, CJK
// and CTL ids.
140 SvxFontItem
aFont( aTextFont
.GetFamilyType(), aTextFont
.GetFamilyName(),
141 OUString(), aTextFont
.GetPitch(), aTextFont
.GetCharSet(), RES_CHRATR_FONT
);
142 m_oItemSet
->Put(aFont
);
143 aFont
.SetWhich(RES_CHRATR_CJK_FONT
);
144 m_oItemSet
->Put(aFont
);
145 aFont
.SetWhich(RES_CHRATR_CTL_FONT
);
146 m_oItemSet
->Put(aFont
);
149 // Calling the parser
// Measures and rewinds the input, starts the progress display, picks a
// paragraph style for m_oPam, runs ReadChars(), then hands the collected
// font/language items to the paragraph style, the document defaults or the
// inserted range, and finally ends the progress display.
150 ErrCode
SwASCIIParser::CallParser()
// Determine the stream length, then start reading from the beginning.
152 m_rInput
.ResetError();
153 m_nFileSize
= m_rInput
.TellEnd();
154 m_rInput
.Seek(STREAM_SEEK_TO_BEGIN
);
155 m_rInput
.ResetError();
157 ::StartProgress(STR_STATSTR_W4WREAD
, 0, m_nFileSize
, m_rDoc
.GetDocShell());
// pInsPam marks where the insertion starts so that attributes can be applied
// to the inserted range afterwards.
// NOTE(review): the condition under which it is set up is not visible in
// this listing.
159 std::optional
<SwPaM
> pInsPam
;
160 sal_Int32 nSttContent
= 0;
163 const SwNode
& rTmp
= m_oPam
->GetPoint()->GetNode();
164 pInsPam
.emplace( rTmp
, rTmp
, SwNodeOffset(0), SwNodeOffset(-1) );
165 nSttContent
= m_oPam
->GetPoint()->GetContentIndex();
// Paragraph style for the imported text: either the HTML "pre" pool style or
// the standard pool style (the selecting condition is not visible here).
168 SwTextFormatColl
*pColl
= nullptr;
172 pColl
= m_rDoc
.getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_HTML_PRE
,
175 pColl
= m_rDoc
.getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_STANDARD
,
178 m_rDoc
.SetTextFormatColl(*m_oPam
, pColl
);
181 ErrCode nError
= ReadChars();
185 // keep only the attributes of scripts that were actually found in the text
186 if (!(SvtScriptType::LATIN
& m_nScript
))
188 m_oItemSet
->ClearItem(RES_CHRATR_FONT
);
189 m_oItemSet
->ClearItem(RES_CHRATR_LANGUAGE
);
191 if (!(SvtScriptType::ASIAN
& m_nScript
))
193 m_oItemSet
->ClearItem(RES_CHRATR_CJK_FONT
);
194 m_oItemSet
->ClearItem(RES_CHRATR_CJK_LANGUAGE
);
196 if (!(SvtScriptType::COMPLEX
& m_nScript
))
198 m_oItemSet
->ClearItem(RES_CHRATR_CTL_FONT
);
199 m_oItemSet
->ClearItem(RES_CHRATR_CTL_LANGUAGE
);
201 if (m_oItemSet
->Count())
207 // Using the pool defaults for the font causes significant
208 // trouble for the HTML filter, because it is not able
209 // to export the pool defaults (or to be more precise:
210 // the HTML filter is not able to detect whether a pool
211 // default has changed or not). Even a comparison with the
212 // HTML template does not work, because the defaults are
213 // not copied when a new doc is created. The result of
214 // comparing pool defaults therefore would be that the
215 // defaults are always exported if they have changed for
216 // text documents in general. That's not sensible, and
217 // neither is always exporting them.
// Zero-terminated list of the font which-ids that are moved from the item
// set to the paragraph style pColl instead.
218 sal_uInt16 aWhichIds
[4] =
220 RES_CHRATR_FONT
, RES_CHRATR_CJK_FONT
,
221 RES_CHRATR_CTL_FONT
, 0
223 sal_uInt16
*pWhichIds
= aWhichIds
;
226 const SfxPoolItem
*pItem
;
227 if (SfxItemState::SET
228 == m_oItemSet
->GetItemState(*pWhichIds
, false, &pItem
))
230 pColl
->SetFormatAttr( *pItem
);
231 m_oItemSet
->ClearItem(*pWhichIds
);
// Whatever is still left in the set becomes a document default.
236 if (m_oItemSet
->Count())
237 m_rDoc
.SetDefault(*m_oItemSet
);
241 // then apply the remaining attributes over the inserted range
242 *pInsPam
->GetMark() = *m_oPam
->GetPoint();
243 pInsPam
->GetPoint()->Assign(pInsPam
->GetPoint()->GetNode(), SwNodeOffset(1),
247 OSL_ENSURE( false, "Have to change - hard attr. to para. style" );
248 m_rDoc
.getIDocumentContentOperations().InsertItemSet(*pInsPam
, *m_oItemSet
);
256 ::EndProgress(m_rDoc
.GetDocShell());
// Reads m_rInput chunk by chunk into m_pArr, turns the bytes into sal_Unicode
// (through an rtl text converter, or by treating the buffer as UCS-2
// directly), and inserts the text at m_oPam, splitting paragraphs at line
// ends, at a hard page break and when a line gets close to MAX_ASCII_PARA.
// NOTE(review): this listing has lost the loop and switch scaffolding, so the
// comments below describe the visible statements only.
260 ErrCode
SwASCIIParser::ReadChars()
// pStt: current character, pEnd: end of the converted chunk, pLastStt: start
// of the text run that has not been inserted yet.
262 sal_Unicode
*pStt
= nullptr, *pEnd
= nullptr, *pLastStt
= nullptr;
263 tools::Long nReadCnt
= 0, nLineLen
= 0;
264 sal_Unicode cLastCR
= 0;
265 bool bSwapUnicode
= false;
// pUseMe points at the option set to work with; it starts as the caller's.
267 const SwAsciiOptions
* pUseMe
= &m_rOpt
;
// aEmpty is a default-constructed option set. If the caller's font name,
// charset, language and paragraph flags all equal those defaults (and the
// stream holds at least two bytes), the first chunk is read and examined by
// SwIoSystem::IsDetectableText().
268 SwAsciiOptions aEmpty
;
269 if (m_nFileSize
>= 2 && aEmpty
.GetFontName() == m_rOpt
.GetFontName()
270 && aEmpty
.GetCharSet() == m_rOpt
.GetCharSet()
271 && aEmpty
.GetLanguage() == m_rOpt
.GetLanguage()
272 && aEmpty
.GetParaFlags() == m_rOpt
.GetParaFlags())
274 sal_Size nLen
, nOrig
;
275 nOrig
= nLen
= m_rInput
.ReadBytes(m_pArr
.get(), ASC_BUFFLEN
);
276 rtl_TextEncoding eCharSet
;
280 = SwIoSystem::IsDetectableText(m_pArr
.get(), nLen
, &eCharSet
,
281 &bSwapUnicode
, &eLineEnd
, &bHasBom
);
// NOTE(review): the condition leading to this early return is not visible.
283 return ERRCODE_IO_BROKENPACKAGE
;
285 OSL_ENSURE(bRet
, "Autodetect of text import without nag dialog must have failed");
// Detection succeeded: store the detected charset, line end and BOM flag in
// aEmpty.
286 if (bRet
&& eCharSet
!= RTL_TEXTENCODING_DONTKNOW
)
288 aEmpty
.SetCharSet(eCharSet
);
289 aEmpty
.SetParaFlags(eLineEnd
);
290 aEmpty
.SetIncludeBOM(bHasBom
);
// Step back over the bytes just read (by nLen here, by nOrig below).
291 m_rInput
.SeekRel(-(tools::Long(nLen
)));
294 m_rInput
.SeekRel(-(tools::Long(nOrig
)));
// Remember which option set was really used, for GetUsedAsciiOptions().
297 m_usedAsciiOptions
= *pUseMe
;
299 rtl_TextToUnicodeConverter hConverter
=nullptr;
300 rtl_TextToUnicodeContext hContext
=nullptr;
301 rtl_TextEncoding currentCharSet
= pUseMe
->GetCharSet();
// Anything but UCS-2 goes through an rtl text converter; an unknown charset
// is converted as US-ASCII.
302 if (RTL_TEXTENCODING_UCS2
!= currentCharSet
)
304 if( currentCharSet
== RTL_TEXTENCODING_DONTKNOW
)
305 currentCharSet
= RTL_TEXTENCODING_ASCII_US
;
306 hConverter
= rtl_createTextToUnicodeConverter( currentCharSet
);
307 OSL_ENSURE( hConverter
, "no string convert available" );
// NOTE(review): the guard of this return is not visible; presumably it is
// taken when no converter could be created - confirm.
309 return ErrCode(ErrCodeArea::Sw
, ErrCodeClass::Read
, 0);
310 bSwapUnicode
= false;
311 hContext
= rtl_createTextToUnicodeContext( hConverter
);
313 else if (pUseMe
!= &aEmpty
) //Already successfully figured out type
// Let the stream set up Unicode reading and report whether the byte order
// has to be swapped.
315 m_rInput
.StartReadingUnicodeText(currentCharSet
);
316 bSwapUnicode
= m_rInput
.IsEndianSwap();
// aWork receives the converted characters; nArrOffset counts the bytes that
// were carried over to the front of m_pArr from the previous chunk.
319 std::unique_ptr
<sal_Unicode
[]> aWork
;
320 sal_Size nArrOffset
= 0;
// Insert the run that is still pending (from pLastStt).
325 if( pLastStt
!= pStt
)
326 InsertText( OUString( pLastStt
));
// Read the next chunk behind any carried-over bytes and leave the loop on a
// stream error.
// NOTE(review): the value the read count is compared with is not visible in
// this listing; presumably 0, i.e. nothing more to read - confirm.
330 if (ERRCODE_NONE
!= m_rInput
.GetError()
332 == (lGCount
= m_rInput
.ReadBytes(m_pArr
.get() + nArrOffset
,
333 ASC_BUFFLEN
- nArrOffset
)))
334 break; // break from the while loop
337 If there was some unconverted bytes on the last cycle then they
338 were put at the beginning of the array, so total bytes available
339 to convert this cycle includes them. If we found 0 following bytes
340 then we ignore the previous partial character.
342 lGCount
+= nArrOffset
;
// Converter path: the output buffer gets one sal_Unicode per input byte plus
// a terminating 0.
347 sal_Size nNewLen
= lGCount
, nCntBytes
;
348 aWork
.reset(new sal_Unicode
[nNewLen
+ 1]); // add 1 for '\0'
349 sal_Unicode
* pBuf
= aWork
.get();
350 pBuf
[nNewLen
] = 0; // ensure '\0'
352 nNewLen
= rtl_convertTextToUnicode(hConverter
, hContext
, m_pArr
.get(), lGCount
,
354 (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT
355 | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT
356 | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
357 | RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
),
// Bytes behind nCntBytes were not consumed by the conversion: move them to
// the front of m_pArr so the next read continues right after them.
359 nArrOffset
= lGCount
- nCntBytes
;
360 if( 0 != nArrOffset
)
361 memmove(m_pArr
.get(), m_pArr
.get() + nCntBytes
, nArrOffset
);
363 pStt
= pLastStt
= aWork
.get();
364 pEnd
= pStt
+ nNewLen
;
// UCS-2 path: the raw buffer is used as sal_Unicode directly, two bytes per
// character.
368 pStt
= pLastStt
= reinterpret_cast<sal_Unicode
*>(m_pArr
.get());
369 auto nChars
= lGCount
/ 2;
370 pEnd
= pStt
+ nChars
;
// pF and pN walk the first and second byte of each 16-bit unit.
// NOTE(review): the loop body is not visible here; presumably it swaps the
// two bytes when bSwapUnicode is set - confirm.
374 char *pF
= m_pArr
.get(), *pN
= m_pArr
.get() + 1;
375 for (sal_Size n
= 0; n
< nChars
; ++n
, pF
+= 2, pN
+= 2)
387 ::SetProgressState(nReadCnt
, m_rDoc
.GetDocShell());
// A LF whose preceding character (remembered in cLastCR) was a CR.
391 if( 0x0a == *pStt
&& 0x0d == cLastCR
)
395 // We skip the last one at the end
396 if (!m_rInput
.eof() || !(pEnd
== pStt
|| (!*pEnd
&& pEnd
== pStt
+ 1)))
397 m_rDoc
.getIDocumentContentOperations().SplitNode(*m_oPam
->GetPoint(), false);
// Per-character decisions: bIns and bSplitNode are the flags for the current
// character.
401 bool bIns
= true, bSplitNode
= false;
// LF is handled here when the options ask for LF line ends.
405 case 0x0a: if( LINEEND_LF
== pUseMe
->GetParaFlags() )
411 // We skip the last one at the end
412 if (!m_rInput
.eof() || pEnd
!= pStt
)
// CR is handled here for every line-end setting except LF; with CRLF the
// following character is checked for LF.
417 case 0x0d: if( LINEEND_LF
!= pUseMe
->GetParaFlags() )
423 bool bChkSplit
= true;
424 if( LINEEND_CRLF
== pUseMe
->GetParaFlags() )
431 else if( 0x0a == *pStt
)
435 // We skip the last one at the end
436 if (bChkSplit
&& (!m_rInput
.eof() || pEnd
!= pStt
))
443 // Insert a hard page break
// The pending text goes in first, then the node is split and a
// "page before" break item is inserted at m_oPam.
447 InsertText( OUString( pLastStt
));
449 m_rDoc
.getIDocumentContentOperations().SplitNode(*m_oPam
->GetPoint(),
451 m_rDoc
.getIDocumentContentOperations().InsertPoolItem(
452 *m_oPam
, SvxFormatBreakItem(SvxBreak::PageBefore
, RES_BREAK
));
// Tests for the very last character of the file (everything read and only
// one character left in the chunk).
460 if (nReadCnt
== m_nFileSize
&& pStt
+ 1 == pEnd
)
463 *pStt
= '#'; // Replacement visualisation
470 // Found control char, replace with '#'
// Forced wrap of overlong lines: within the last 100 characters before
// MAX_ASCII_PARA break at a space, at MAX_ASCII_PARA - 1 break regardless.
477 if( ( nLineLen
>= MAX_ASCII_PARA
- 100 ) &&
478 ( ( *pStt
== ' ' ) || ( nLineLen
>= MAX_ASCII_PARA
- 1 ) ) )
480 sal_Unicode c
= *pStt
;
482 InsertText( OUString( pLastStt
));
483 m_rDoc
.getIDocumentContentOperations().SplitNode(*m_oPam
->GetPoint(), false);
491 else if( bSplitNode
)
493 // We found a CR/LF, thus save the text
494 InsertText( OUString( pLastStt
));
// A new paragraph is started either with AppendTextNode() or with
// SplitNode(); the condition choosing between them is not visible here.
496 m_rDoc
.getIDocumentContentOperations().AppendTextNode(*m_oPam
->GetPoint());
498 m_rDoc
.getIDocumentContentOperations().SplitNode(*m_oPam
->GetPoint(), false);
// Release the converter context and the converter.
506 rtl_destroyTextToUnicodeContext( hConverter
, hContext
);
507 rtl_destroyTextToUnicodeConverter( hConverter
);
// Inserts rStr into the document at m_oPam and records which script types
// occur in it.
512 void SwASCIIParser::InsertText( const OUString
& rStr
)
514 m_rDoc
.getIDocumentContentOperations().InsertString(*m_oPam
, rStr
);
// Scanning is only done while there is an item set and a break iterator, and
// stops once Latin, Asian and Complex have all been seen, since no further
// bit can be added to m_nScript after that.
516 if (m_oItemSet
&& g_pBreakIt
517 && m_nScript
!= (SvtScriptType::LATIN
| SvtScriptType::ASIAN
| SvtScriptType::COMPLEX
))
518 m_nScript
|= g_pBreakIt
->GetAllScriptsOfText(rStr
);
521 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */