Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
[LibreOffice.git] / sw / source / filter / ascii / parasc.cxx
blob b29251bcbd8b855a8ba736cb54c2d057a4d18ef4
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <memory>
22 #include <tools/stream.hxx>
23 #include <hintids.hxx>
24 #include <sfx2/docfile.hxx>
25 #include <sfx2/printer.hxx>
26 #include <sfx2/sfxsids.hrc>
27 #include <editeng/fontitem.hxx>
28 #include <editeng/langitem.hxx>
29 #include <editeng/formatbreakitem.hxx>
30 #include <svl/languageoptions.hxx>
31 #include <shellio.hxx>
32 #include <doc.hxx>
33 #include <IDocumentContentOperations.hxx>
34 #include <IDocumentDeviceAccess.hxx>
35 #include <IDocumentStylePoolAccess.hxx>
36 #include <pam.hxx>
37 #include <breakit.hxx>
38 #include <swerror.h>
39 #include <strings.hrc>
40 #include <mdiexp.hxx>
41 #include <poolfmt.hxx>
42 #include <iodetect.hxx>
44 #include <vcl/metric.hxx>
45 #include <osl/diagnose.h>
47 #define ASC_BUFFLEN 4096
49 namespace {
51 class SwASCIIParser
53 SwDoc& m_rDoc;
54 std::optional<SwPaM> m_oPam;
55 SvStream& m_rInput;
56 std::unique_ptr<char[]> m_pArr;
57 const SwAsciiOptions& m_rOpt;
58 SwAsciiOptions m_usedAsciiOptions;
59 std::optional<SfxItemSet> m_oItemSet;
60 tools::Long m_nFileSize;
61 SvtScriptType m_nScript;
62 bool m_bNewDoc;
64 ErrCode ReadChars();
65 void InsertText( const OUString& rStr );
67 SwASCIIParser(const SwASCIIParser&) = delete;
68 SwASCIIParser& operator=(const SwASCIIParser&) = delete;
70 public:
71 SwASCIIParser( SwDoc& rD, const SwPaM& rCursor, SvStream& rIn,
72 bool bReadNewDoc, const SwAsciiOptions& rOpts );
74 ErrCode CallParser();
75 const SwAsciiOptions& GetUsedAsciiOptions() const { return m_usedAsciiOptions; }
80 // Call for the general reader interface
81 ErrCode AsciiReader::Read( SwDoc& rDoc, const OUString&, SwPaM &rPam, const OUString & )
83 if( !m_pStream )
85 OSL_ENSURE( false, "ASCII read without a stream" );
86 return ERR_SWG_READ_ERROR;
89 ErrCode nRet;
91 SwASCIIParser aParser( rDoc, rPam, *m_pStream,
92 !m_bInsertMode, m_aOption.GetASCIIOpts() );
93 nRet = aParser.CallParser();
95 OUString optionsString;
96 aParser.GetUsedAsciiOptions().WriteUserData(optionsString);
98 if(m_pMedium != nullptr && m_pMedium->GetItemSet() != nullptr)
99 m_pMedium->GetItemSet()->Put(SfxStringItem(SID_FILE_FILTEROPTIONS, optionsString));
101 // after Read reset the options
102 m_aOption.ResetASCIIOpts();
103 return nRet;
106 SwASCIIParser::SwASCIIParser(SwDoc& rD, const SwPaM& rCursor, SvStream& rIn, bool bReadNewDoc,
107 const SwAsciiOptions& rOpts)
108 : m_rDoc(rD)
109 , m_rInput(rIn)
110 , m_rOpt(rOpts)
111 , m_usedAsciiOptions(rOpts)
112 , m_nFileSize(0)
113 , m_nScript(SvtScriptType::NONE)
114 , m_bNewDoc(bReadNewDoc)
116 m_oPam.emplace(*rCursor.GetPoint());
117 m_pArr.reset(new char[ASC_BUFFLEN + 2]);
119 m_oItemSet.emplace(
120 m_rDoc.GetAttrPool(),
121 svl::Items<RES_CHRATR_FONT, RES_CHRATR_LANGUAGE, RES_CHRATR_CJK_FONT,
122 RES_CHRATR_CJK_LANGUAGE, RES_CHRATR_CTL_FONT, RES_CHRATR_CTL_LANGUAGE>);
124 // set defaults from the options
125 if (m_rOpt.GetLanguage())
127 SvxLanguageItem aLang(m_rOpt.GetLanguage(), RES_CHRATR_LANGUAGE);
128 m_oItemSet->Put(aLang);
129 aLang.SetWhich(RES_CHRATR_CJK_LANGUAGE);
130 m_oItemSet->Put(aLang);
131 aLang.SetWhich(RES_CHRATR_CTL_LANGUAGE);
132 m_oItemSet->Put(aLang);
134 if (m_rOpt.GetFontName().isEmpty())
135 return;
137 vcl::Font aTextFont(m_rOpt.GetFontName(), Size(0, 10));
138 if (m_rDoc.getIDocumentDeviceAccess().getPrinter(false))
139 aTextFont = m_rDoc.getIDocumentDeviceAccess().getPrinter(false)->GetFontMetric(aTextFont);
140 SvxFontItem aFont( aTextFont.GetFamilyType(), aTextFont.GetFamilyName(),
141 OUString(), aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
142 m_oItemSet->Put(aFont);
143 aFont.SetWhich(RES_CHRATR_CJK_FONT);
144 m_oItemSet->Put(aFont);
145 aFont.SetWhich(RES_CHRATR_CTL_FONT);
146 m_oItemSet->Put(aFont);
149 // Calling the parser
150 ErrCode SwASCIIParser::CallParser()
152 m_rInput.ResetError();
153 m_nFileSize = m_rInput.TellEnd();
154 m_rInput.Seek(STREAM_SEEK_TO_BEGIN);
155 m_rInput.ResetError();
157 ::StartProgress(STR_STATSTR_W4WREAD, 0, m_nFileSize, m_rDoc.GetDocShell());
159 std::optional<SwPaM> pInsPam;
160 sal_Int32 nSttContent = 0;
161 if (!m_bNewDoc)
163 const SwNode& rTmp = m_oPam->GetPoint()->GetNode();
164 pInsPam.emplace( rTmp, rTmp, SwNodeOffset(0), SwNodeOffset(-1) );
165 nSttContent = m_oPam->GetPoint()->GetContentIndex();
168 SwTextFormatColl *pColl = nullptr;
170 if (m_bNewDoc)
172 pColl = m_rDoc.getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_HTML_PRE,
173 false);
174 if (!pColl)
175 pColl = m_rDoc.getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_STANDARD,
176 false);
177 if (pColl)
178 m_rDoc.SetTextFormatColl(*m_oPam, pColl);
181 ErrCode nError = ReadChars();
183 if (m_oItemSet)
185 // set only the attribute, for scanned scripts.
186 if (!(SvtScriptType::LATIN & m_nScript))
188 m_oItemSet->ClearItem(RES_CHRATR_FONT);
189 m_oItemSet->ClearItem(RES_CHRATR_LANGUAGE);
191 if (!(SvtScriptType::ASIAN & m_nScript))
193 m_oItemSet->ClearItem(RES_CHRATR_CJK_FONT);
194 m_oItemSet->ClearItem(RES_CHRATR_CJK_LANGUAGE);
196 if (!(SvtScriptType::COMPLEX & m_nScript))
198 m_oItemSet->ClearItem(RES_CHRATR_CTL_FONT);
199 m_oItemSet->ClearItem(RES_CHRATR_CTL_LANGUAGE);
201 if (m_oItemSet->Count())
203 if (m_bNewDoc)
205 if (pColl)
207 // Using the pool defaults for the font causes significant
208 // trouble for the HTML filter, because it is not able
209 // to export the pool defaults (or to be more precise:
210 // the HTML filter is not able to detect whether a pool
211 // default has changed or not. Even a comparison with the
212 // HTML template does not work, because the defaults are
213 // not copied when a new doc is created. The result of
214 // comparing pool defaults therefore would be that the
215 // defaults are exported always if the have changed for
216 // text documents in general. That's not sensible, as well
217 // as it is not sensible to export them always.
218 sal_uInt16 aWhichIds[4] =
220 RES_CHRATR_FONT, RES_CHRATR_CJK_FONT,
221 RES_CHRATR_CTL_FONT, 0
223 sal_uInt16 *pWhichIds = aWhichIds;
224 while (*pWhichIds)
226 const SfxPoolItem *pItem;
227 if (SfxItemState::SET
228 == m_oItemSet->GetItemState(*pWhichIds, false, &pItem))
230 pColl->SetFormatAttr( *pItem );
231 m_oItemSet->ClearItem(*pWhichIds);
233 ++pWhichIds;
236 if (m_oItemSet->Count())
237 m_rDoc.SetDefault(*m_oItemSet);
239 else if( pInsPam )
241 // then set over the insert range the defined attributes
242 *pInsPam->GetMark() = *m_oPam->GetPoint();
243 pInsPam->GetPoint()->Assign(pInsPam->GetPoint()->GetNode(), SwNodeOffset(1),
244 nSttContent );
246 // !!!!!
247 OSL_ENSURE( false, "Have to change - hard attr. to para. style" );
248 m_rDoc.getIDocumentContentOperations().InsertItemSet(*pInsPam, *m_oItemSet);
251 m_oItemSet.reset();
254 pInsPam.reset();
256 ::EndProgress(m_rDoc.GetDocShell());
257 return nError;
260 ErrCode SwASCIIParser::ReadChars()
262 sal_Unicode *pStt = nullptr, *pEnd = nullptr, *pLastStt = nullptr;
263 tools::Long nReadCnt = 0, nLineLen = 0;
264 sal_Unicode cLastCR = 0;
265 bool bSwapUnicode = false;
267 const SwAsciiOptions* pUseMe = &m_rOpt;
268 SwAsciiOptions aEmpty;
269 if (m_nFileSize >= 2 && aEmpty.GetFontName() == m_rOpt.GetFontName()
270 && aEmpty.GetCharSet() == m_rOpt.GetCharSet()
271 && aEmpty.GetLanguage() == m_rOpt.GetLanguage()
272 && aEmpty.GetParaFlags() == m_rOpt.GetParaFlags())
274 sal_Size nLen, nOrig;
275 nOrig = nLen = m_rInput.ReadBytes(m_pArr.get(), ASC_BUFFLEN);
276 rtl_TextEncoding eCharSet;
277 LineEnd eLineEnd;
278 bool bHasBom;
279 const bool bRet
280 = SwIoSystem::IsDetectableText(m_pArr.get(), nLen, &eCharSet,
281 &bSwapUnicode, &eLineEnd, &bHasBom);
282 if (!bRet)
283 return ERRCODE_IO_BROKENPACKAGE;
285 OSL_ENSURE(bRet, "Autodetect of text import without nag dialog must have failed");
286 if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
288 aEmpty.SetCharSet(eCharSet);
289 aEmpty.SetParaFlags(eLineEnd);
290 aEmpty.SetIncludeBOM(bHasBom);
291 m_rInput.SeekRel(-(tools::Long(nLen)));
293 else
294 m_rInput.SeekRel(-(tools::Long(nOrig)));
295 pUseMe=&aEmpty;
297 m_usedAsciiOptions = *pUseMe;
299 rtl_TextToUnicodeConverter hConverter=nullptr;
300 rtl_TextToUnicodeContext hContext=nullptr;
301 rtl_TextEncoding currentCharSet = pUseMe->GetCharSet();
302 if (RTL_TEXTENCODING_UCS2 != currentCharSet)
304 if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
305 currentCharSet = RTL_TEXTENCODING_ASCII_US;
306 hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
307 OSL_ENSURE( hConverter, "no string convert available" );
308 if (!hConverter)
309 return ErrCode(ErrCodeArea::Sw, ErrCodeClass::Read, 0);
310 bSwapUnicode = false;
311 hContext = rtl_createTextToUnicodeContext( hConverter );
313 else if (pUseMe != &aEmpty) //Already successfully figured out type
315 m_rInput.StartReadingUnicodeText(currentCharSet);
316 bSwapUnicode = m_rInput.IsEndianSwap();
319 std::unique_ptr<sal_Unicode[]> aWork;
320 sal_Size nArrOffset = 0;
322 do {
323 if( pStt >= pEnd )
325 if( pLastStt != pStt )
326 InsertText( OUString( pLastStt ));
328 // Read a new block
329 sal_Size lGCount;
330 if (ERRCODE_NONE != m_rInput.GetError()
331 || 0
332 == (lGCount = m_rInput.ReadBytes(m_pArr.get() + nArrOffset,
333 ASC_BUFFLEN - nArrOffset)))
334 break; // break from the while loop
337 If there was some unconverted bytes on the last cycle then they
338 were put at the beginning of the array, so total bytes available
339 to convert this cycle includes them. If we found 0 following bytes
340 then we ignore the previous partial character.
342 lGCount += nArrOffset;
344 if( hConverter )
346 sal_uInt32 nInfo;
347 sal_Size nNewLen = lGCount, nCntBytes;
348 aWork.reset(new sal_Unicode[nNewLen + 1]); // add 1 for '\0'
349 sal_Unicode* pBuf = aWork.get();
350 pBuf[nNewLen] = 0; // ensure '\0'
352 nNewLen = rtl_convertTextToUnicode(hConverter, hContext, m_pArr.get(), lGCount,
353 pBuf, nNewLen,
354 (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT
355 | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT
356 | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
357 | RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE),
358 &nInfo, &nCntBytes);
359 nArrOffset = lGCount - nCntBytes;
360 if( 0 != nArrOffset )
361 memmove(m_pArr.get(), m_pArr.get() + nCntBytes, nArrOffset);
363 pStt = pLastStt = aWork.get();
364 pEnd = pStt + nNewLen;
366 else
368 pStt = pLastStt = reinterpret_cast<sal_Unicode*>(m_pArr.get());
369 auto nChars = lGCount / 2;
370 pEnd = pStt + nChars;
372 if( bSwapUnicode )
374 char *pF = m_pArr.get(), *pN = m_pArr.get() + 1;
375 for (sal_Size n = 0; n < nChars; ++n, pF += 2, pN += 2)
377 char c = *pF;
378 *pF = *pN;
379 *pN = c;
384 *pEnd = 0;
385 nReadCnt += lGCount;
387 ::SetProgressState(nReadCnt, m_rDoc.GetDocShell());
389 if( cLastCR )
391 if( 0x0a == *pStt && 0x0d == cLastCR )
392 pLastStt = ++pStt;
393 cLastCR = 0;
394 nLineLen = 0;
395 // We skip the last one at the end
396 if (!m_rInput.eof() || !(pEnd == pStt || (!*pEnd && pEnd == pStt + 1)))
397 m_rDoc.getIDocumentContentOperations().SplitNode(*m_oPam->GetPoint(), false);
401 bool bIns = true, bSplitNode = false;
402 switch( *pStt )
405 case 0x0a: if( LINEEND_LF == pUseMe->GetParaFlags() )
407 bIns = false;
408 *pStt = 0;
409 ++pStt;
411 // We skip the last one at the end
412 if (!m_rInput.eof() || pEnd != pStt)
413 bSplitNode = true;
415 break;
417 case 0x0d: if( LINEEND_LF != pUseMe->GetParaFlags() )
419 bIns = false;
420 *pStt = 0;
421 ++pStt;
423 bool bChkSplit = true;
424 if( LINEEND_CRLF == pUseMe->GetParaFlags() )
426 if( pStt == pEnd )
428 cLastCR = 0x0d;
429 bChkSplit = false;
431 else if( 0x0a == *pStt )
432 ++pStt;
435 // We skip the last one at the end
436 if (bChkSplit && (!m_rInput.eof() || pEnd != pStt))
437 bSplitNode = true;
439 break;
441 case 0x0c:
443 // Insert a hard page break
444 *pStt++ = 0;
445 if( nLineLen )
447 InsertText( OUString( pLastStt ));
449 m_rDoc.getIDocumentContentOperations().SplitNode(*m_oPam->GetPoint(),
450 false);
451 m_rDoc.getIDocumentContentOperations().InsertPoolItem(
452 *m_oPam, SvxFormatBreakItem(SvxBreak::PageBefore, RES_BREAK));
453 pLastStt = pStt;
454 nLineLen = 0;
455 bIns = false;
457 break;
459 case 0x1a:
460 if (nReadCnt == m_nFileSize && pStt + 1 == pEnd)
461 *pStt = 0;
462 else
463 *pStt = '#'; // Replacement visualisation
464 break;
466 case '\t': break;
468 default:
469 if( ' ' > *pStt )
470 // Found control char, replace with '#'
471 *pStt = '#';
472 break;
475 if( bIns )
477 if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
478 ( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
480 sal_Unicode c = *pStt;
481 *pStt = 0;
482 InsertText( OUString( pLastStt ));
483 m_rDoc.getIDocumentContentOperations().SplitNode(*m_oPam->GetPoint(), false);
484 pLastStt = pStt;
485 nLineLen = 0;
486 *pStt = c;
488 ++pStt;
489 ++nLineLen;
491 else if( bSplitNode )
493 // We found a CR/LF, thus save the text
494 InsertText( OUString( pLastStt ));
495 if (m_bNewDoc)
496 m_rDoc.getIDocumentContentOperations().AppendTextNode(*m_oPam->GetPoint());
497 else
498 m_rDoc.getIDocumentContentOperations().SplitNode(*m_oPam->GetPoint(), false);
499 pLastStt = pStt;
500 nLineLen = 0;
502 } while(true);
504 if( hConverter )
506 rtl_destroyTextToUnicodeContext( hConverter, hContext );
507 rtl_destroyTextToUnicodeConverter( hConverter );
509 return ERRCODE_NONE;
512 void SwASCIIParser::InsertText( const OUString& rStr )
514 m_rDoc.getIDocumentContentOperations().InsertString(*m_oPam, rStr);
516 if (m_oItemSet && g_pBreakIt
517 && m_nScript != (SvtScriptType::LATIN | SvtScriptType::ASIAN | SvtScriptType::COMPLEX))
518 m_nScript |= g_pBreakIt->GetAllScriptsOfText(rStr);
521 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */