sw/source/filter/ascii/parasc.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <memory>
  21
  22 #include <tools/stream.hxx>
  23 #include <hintids.hxx>
  24 #include <sfx2/docfile.hxx>
  25 #include <sfx2/printer.hxx>
  26 #include <sfx2/sfxsids.hrc>
  27 #include <editeng/fontitem.hxx>
  28 #include <editeng/langitem.hxx>
  29 #include <editeng/formatbreakitem.hxx>
  30 #include <svl/languageoptions.hxx>
  31 #include <shellio.hxx>
  32 #include <doc.hxx>
  33 #include <IDocumentContentOperations.hxx>
  34 #include <IDocumentDeviceAccess.hxx>
  35 #include <IDocumentStylePoolAccess.hxx>
  36 #include <pam.hxx>
  37 #include <breakit.hxx>
  38 #include <swerror.h>
  39 #include <strings.hrc>
  40 #include <mdiexp.hxx>
  41 #include <poolfmt.hxx>
  42 #include <iodetect.hxx>
  43
  44 #include <vcl/metric.hxx>
  45 #include <osl/diagnose.h>
  46
  47 #define ASC_BUFFLEN 4096
  48
  49 namespace {
  50
  51 class SwASCIIParser
  52 {
  53     SwDoc& m_rDoc;
  54     std::optional<SwPaM> m_oPam;
  55     SvStream& m_rInput;
  56     std::unique_ptr<char[]> m_pArr;
  57     const SwAsciiOptions& m_rOpt;
  58     SwAsciiOptions m_usedAsciiOptions;
  59     std::optional<SfxItemSet> m_oItemSet;
  60     tools::Long m_nFileSize;
  61     SvtScriptType m_nScript;
  62     bool m_bNewDoc;
  63
  64     ErrCode ReadChars();
  65     void InsertText( const OUString& rStr );
  66
  67     SwASCIIParser(const SwASCIIParser&) = delete;
  68     SwASCIIParser& operator=(const SwASCIIParser&) = delete;
  69
  70 public:
  71     SwASCIIParser( SwDoc& rD, const SwPaM& rCursor, SvStream& rIn,
  72                             bool bReadNewDoc, const SwAsciiOptions& rOpts );
  73
  74     ErrCode CallParser();
  75     const SwAsciiOptions& GetUsedAsciiOptions() const { return m_usedAsciiOptions; }
  76 };
  77
  78 }
  79
  80 // Call for the general reader interface
  81 ErrCode AsciiReader::Read( SwDoc& rDoc, const OUString&, SwPaM &rPam, const OUString & )
  82 {
  83     if( !m_pStream )
  84     {
  85         OSL_ENSURE( false, "ASCII read without a stream" );
  86         return ERR_SWG_READ_ERROR;
  87     }
  88
  89     ErrCode nRet;
  90     {
  91         SwASCIIParser aParser( rDoc, rPam, *m_pStream,
  92                                 !m_bInsertMode, m_aOption.GetASCIIOpts() );
  93         nRet = aParser.CallParser();
  94
  95         OUString optionsString;
  96         aParser.GetUsedAsciiOptions().WriteUserData(optionsString);
  97
  98         if(m_pMedium != nullptr && m_pMedium->GetItemSet() != nullptr)
  99             m_pMedium->GetItemSet()->Put(SfxStringItem(SID_FILE_FILTEROPTIONS, optionsString));
 100     }
 101     // after Read reset the options
 102     m_aOption.ResetASCIIOpts();
 103     return nRet;
 104 }
 105
 106 SwASCIIParser::SwASCIIParser(SwDoc& rD, const SwPaM& rCursor, SvStream& rIn, bool bReadNewDoc,
 107                              const SwAsciiOptions& rOpts)
 108     : m_rDoc(rD)
 109     , m_rInput(rIn)
 110     , m_rOpt(rOpts)
 111     , m_usedAsciiOptions(rOpts)
 112     , m_nFileSize(0)
 113     , m_nScript(SvtScriptType::NONE)
 114     , m_bNewDoc(bReadNewDoc)
 115 {
 116     m_oPam.emplace(*rCursor.GetPoint());
 117     m_pArr.reset(new char[ASC_BUFFLEN + 2]);
 118
 119     m_oItemSet.emplace(
 120         m_rDoc.GetAttrPool(),
 121         svl::Items<RES_CHRATR_FONT, RES_CHRATR_LANGUAGE, RES_CHRATR_CJK_FONT,
 122                    RES_CHRATR_CJK_LANGUAGE, RES_CHRATR_CTL_FONT, RES_CHRATR_CTL_LANGUAGE>);
 123
 124     // set defaults from the options
 125     if (m_rOpt.GetLanguage())
 126     {
 127         SvxLanguageItem aLang(m_rOpt.GetLanguage(), RES_CHRATR_LANGUAGE);
 128         m_oItemSet->Put(aLang);
 129         aLang.SetWhich(RES_CHRATR_CJK_LANGUAGE);
 130         m_oItemSet->Put(aLang);
 131         aLang.SetWhich(RES_CHRATR_CTL_LANGUAGE);
 132         m_oItemSet->Put(aLang);
 133     }
 134     if (m_rOpt.GetFontName().isEmpty())
 135         return;
 136
 137     vcl::Font aTextFont(m_rOpt.GetFontName(), Size(0, 10));
 138     if (m_rDoc.getIDocumentDeviceAccess().getPrinter(false))
 139         aTextFont = m_rDoc.getIDocumentDeviceAccess().getPrinter(false)->GetFontMetric(aTextFont);
 140     SvxFontItem aFont( aTextFont.GetFamilyType(), aTextFont.GetFamilyName(),
 141                        OUString(), aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
 142     m_oItemSet->Put(aFont);
 143     aFont.SetWhich(RES_CHRATR_CJK_FONT);
 144     m_oItemSet->Put(aFont);
 145     aFont.SetWhich(RES_CHRATR_CTL_FONT);
 146     m_oItemSet->Put(aFont);
 147 }
 148
 149 // Calling the parser
 150 ErrCode SwASCIIParser::CallParser()
 151 {
 152     m_rInput.ResetError();
 153     m_nFileSize = m_rInput.TellEnd();
 154     m_rInput.Seek(STREAM_SEEK_TO_BEGIN);
 155     m_rInput.ResetError();
 156
 157     ::StartProgress(STR_STATSTR_W4WREAD, 0, m_nFileSize, m_rDoc.GetDocShell());
 158
 159     std::optional<SwPaM> pInsPam;
 160     sal_Int32 nSttContent = 0;
 161     if (!m_bNewDoc)
 162     {
 163         const SwNode& rTmp = m_oPam->GetPoint()->GetNode();
 164         pInsPam.emplace( rTmp, rTmp, SwNodeOffset(0), SwNodeOffset(-1) );
 165         nSttContent = m_oPam->GetPoint()->GetContentIndex();
 166     }
 167
 168     SwTextFormatColl *pColl = nullptr;
 169
 170     if (m_bNewDoc)
 171     {
 172         pColl = m_rDoc.getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_HTML_PRE,
 173                                                                          false);
 174         if (!pColl)
 175             pColl = m_rDoc.getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_STANDARD,
 176                                                                              false);
 177         if (pColl)
 178             m_rDoc.SetTextFormatColl(*m_oPam, pColl);
 179     }
 180
 181     ErrCode nError = ReadChars();
 182
 183     if (m_oItemSet)
 184     {
 185         // set only the attribute, for scanned scripts.
 186         if (!(SvtScriptType::LATIN & m_nScript))
 187         {
 188             m_oItemSet->ClearItem(RES_CHRATR_FONT);
 189             m_oItemSet->ClearItem(RES_CHRATR_LANGUAGE);
 190         }
 191         if (!(SvtScriptType::ASIAN & m_nScript))
 192         {
 193             m_oItemSet->ClearItem(RES_CHRATR_CJK_FONT);
 194             m_oItemSet->ClearItem(RES_CHRATR_CJK_LANGUAGE);
 195         }
 196         if (!(SvtScriptType::COMPLEX & m_nScript))
 197         {
 198             m_oItemSet->ClearItem(RES_CHRATR_CTL_FONT);
 199             m_oItemSet->ClearItem(RES_CHRATR_CTL_LANGUAGE);
 200         }
 201         if (m_oItemSet->Count())
 202         {
 203             if (m_bNewDoc)
 204             {
 205                 if (pColl)
 206                 {
 207                     // Using the pool defaults for the font causes significant
 208                     // trouble for the HTML filter, because it is not able
 209                     // to export the pool defaults (or to be more precise:
 210                     // the HTML filter is not able to detect whether a pool
 211                     // default has changed or not. Even a comparison with the
 212                     // HTML template does not work, because the defaults are
 213                     // not copied when a new doc is created. The result of
 214                     // comparing pool defaults therefore would be that the
 215                     // defaults are exported always if the have changed for
 216                     // text documents in general. That's not sensible, as well
 217                     // as it is not sensible to export them always.
 218                     sal_uInt16 aWhichIds[4] =
 219                     {
 220                         RES_CHRATR_FONT, RES_CHRATR_CJK_FONT,
 221                         RES_CHRATR_CTL_FONT, 0
 222                     };
 223                     sal_uInt16 *pWhichIds = aWhichIds;
 224                     while (*pWhichIds)
 225                     {
 226                         const SfxPoolItem *pItem;
 227                         if (SfxItemState::SET
 228                             == m_oItemSet->GetItemState(*pWhichIds, false, &pItem))
 229                         {
 230                             pColl->SetFormatAttr( *pItem );
 231                             m_oItemSet->ClearItem(*pWhichIds);
 232                         }
 233                         ++pWhichIds;
 234                     }
 235                 }
 236                 if (m_oItemSet->Count())
 237                     m_rDoc.SetDefault(*m_oItemSet);
 238             }
 239             else if( pInsPam )
 240             {
 241                 // then set over the insert range the defined attributes
 242                 *pInsPam->GetMark() = *m_oPam->GetPoint();
 243                 pInsPam->GetPoint()->Assign(pInsPam->GetPoint()->GetNode(), SwNodeOffset(1),
 244                                     nSttContent );
 245
 246                 // !!!!!
 247                 OSL_ENSURE( false, "Have to change - hard attr. to para. style" );
 248                 m_rDoc.getIDocumentContentOperations().InsertItemSet(*pInsPam, *m_oItemSet);
 249             }
 250         }
 251         m_oItemSet.reset();
 252     }
 253
 254     pInsPam.reset();
 255
 256     ::EndProgress(m_rDoc.GetDocShell());
 257     return nError;
 258 }
 259
 260 ErrCode SwASCIIParser::ReadChars()
 261 {
 262     sal_Unicode *pStt = nullptr, *pEnd = nullptr, *pLastStt = nullptr;
 263     tools::Long nReadCnt = 0, nLineLen = 0;
 264     sal_Unicode cLastCR = 0;
 265     bool bSwapUnicode = false;
 266
 267     const SwAsciiOptions* pUseMe = &m_rOpt;
 268     SwAsciiOptions aEmpty;
 269     if (m_nFileSize >= 2 && aEmpty.GetFontName() == m_rOpt.GetFontName()
 270         && aEmpty.GetCharSet() == m_rOpt.GetCharSet()
 271         && aEmpty.GetLanguage() == m_rOpt.GetLanguage()
 272         && aEmpty.GetParaFlags() == m_rOpt.GetParaFlags())
 273     {
 274         sal_Size nLen, nOrig;
 275         nOrig = nLen = m_rInput.ReadBytes(m_pArr.get(), ASC_BUFFLEN);
 276         rtl_TextEncoding eCharSet;
 277         LineEnd eLineEnd;
 278         bool bHasBom;
 279         const bool bRet
 280             = SwIoSystem::IsDetectableText(m_pArr.get(), nLen, &eCharSet,
 281                                             &bSwapUnicode, &eLineEnd, &bHasBom);
 282         if (!bRet)
 283             return ERRCODE_IO_BROKENPACKAGE;
 284
 285         OSL_ENSURE(bRet, "Autodetect of text import without nag dialog must have failed");
 286         if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
 287         {
 288             aEmpty.SetCharSet(eCharSet);
 289             aEmpty.SetParaFlags(eLineEnd);
 290             aEmpty.SetIncludeBOM(bHasBom);
 291             m_rInput.SeekRel(-(tools::Long(nLen)));
 292         }
 293         else
 294             m_rInput.SeekRel(-(tools::Long(nOrig)));
 295         pUseMe=&aEmpty;
 296     }
 297     m_usedAsciiOptions = *pUseMe;
 298
 299     rtl_TextToUnicodeConverter hConverter=nullptr;
 300     rtl_TextToUnicodeContext hContext=nullptr;
 301     rtl_TextEncoding currentCharSet = pUseMe->GetCharSet();
 302     if (RTL_TEXTENCODING_UCS2 != currentCharSet)
 303     {
 304         if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
 305                 currentCharSet = RTL_TEXTENCODING_ASCII_US;
 306         hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
 307         OSL_ENSURE( hConverter, "no string convert available" );
 308         if (!hConverter)
 309             return ErrCode(ErrCodeArea::Sw, ErrCodeClass::Read, 0);
 310         bSwapUnicode = false;
 311         hContext = rtl_createTextToUnicodeContext( hConverter );
 312     }
 313     else if (pUseMe != &aEmpty)  //Already successfully figured out type
 314     {
 315         m_rInput.StartReadingUnicodeText(currentCharSet);
 316         bSwapUnicode = m_rInput.IsEndianSwap();
 317     }
 318
 319     std::unique_ptr<sal_Unicode[]> aWork;
 320     sal_Size nArrOffset = 0;
 321
 322     do {
 323         if( pStt >= pEnd )
 324         {
 325             if( pLastStt != pStt )
 326                 InsertText( OUString( pLastStt ));
 327
 328             // Read a new block
 329             sal_Size lGCount;
 330             if (ERRCODE_NONE != m_rInput.GetError()
 331                 || 0
 332                        == (lGCount = m_rInput.ReadBytes(m_pArr.get() + nArrOffset,
 333                                                         ASC_BUFFLEN - nArrOffset)))
 334                 break;      // break from the while loop
 335
 336             /*
 337             If there was some unconverted bytes on the last cycle then they
 338             were put at the beginning of the array, so total bytes available
 339             to convert this cycle includes them. If we found 0 following bytes
 340             then we ignore the previous partial character.
 341             */
 342             lGCount += nArrOffset;
 343
 344             if( hConverter )
 345             {
 346                 sal_uInt32 nInfo;
 347                 sal_Size nNewLen = lGCount, nCntBytes;
 348                 aWork.reset(new sal_Unicode[nNewLen + 1]); // add 1 for '\0'
 349                 sal_Unicode* pBuf = aWork.get();
 350                 pBuf[nNewLen] = 0;                         // ensure '\0'
 351
 352                 nNewLen = rtl_convertTextToUnicode(hConverter, hContext, m_pArr.get(), lGCount,
 353                                                    pBuf, nNewLen,
 354                                                    (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT
 355                                                     | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT
 356                                                     | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
 357                                                     | RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE),
 358                                                    &nInfo, &nCntBytes);
 359                 nArrOffset = lGCount - nCntBytes;
 360                 if( 0 != nArrOffset )
 361                     memmove(m_pArr.get(), m_pArr.get() + nCntBytes, nArrOffset);
 362
 363                 pStt = pLastStt = aWork.get();
 364                 pEnd = pStt + nNewLen;
 365             }
 366             else
 367             {
 368                 pStt = pLastStt = reinterpret_cast<sal_Unicode*>(m_pArr.get());
 369                 auto nChars = lGCount / 2;
 370                 pEnd = pStt + nChars;
 371
 372                 if( bSwapUnicode )
 373                 {
 374                     char *pF = m_pArr.get(), *pN = m_pArr.get() + 1;
 375                     for (sal_Size n = 0; n < nChars; ++n, pF += 2, pN += 2)
 376                     {
 377                         char c = *pF;
 378                         *pF = *pN;
 379                         *pN = c;
 380                     }
 381                 }
 382             }
 383
 384             *pEnd = 0;
 385             nReadCnt += lGCount;
 386
 387             ::SetProgressState(nReadCnt, m_rDoc.GetDocShell());
 388
 389             if( cLastCR )
 390             {
 391                 if( 0x0a == *pStt && 0x0d == cLastCR )
 392                     pLastStt = ++pStt;
 393                 cLastCR = 0;
 394                 nLineLen = 0;
 395                 // We skip the last one at the end
 396                 if (!m_rInput.eof() || !(pEnd == pStt || (!*pEnd && pEnd == pStt + 1)))
 397                     m_rDoc.getIDocumentContentOperations().SplitNode(*m_oPam->GetPoint(), false);
 398             }
 399         }
 400
 401         bool bIns = true, bSplitNode = false;
 402         switch( *pStt )
 403         {
 404
 405         case 0x0a:  if( LINEEND_LF == pUseMe->GetParaFlags() )
 406                     {
 407                         bIns = false;
 408                         *pStt = 0;
 409                         ++pStt;
 410
 411                         // We skip the last one at the end
 412                         if (!m_rInput.eof() || pEnd != pStt)
 413                             bSplitNode = true;
 414                     }
 415                     break;
 416
 417         case 0x0d:  if( LINEEND_LF != pUseMe->GetParaFlags() )
 418                     {
 419                         bIns = false;
 420                         *pStt = 0;
 421                         ++pStt;
 422
 423                         bool bChkSplit = true;
 424                         if( LINEEND_CRLF == pUseMe->GetParaFlags() )
 425                         {
 426                             if( pStt == pEnd )
 427                             {
 428                                 cLastCR = 0x0d;
 429                                 bChkSplit = false;
 430                             }
 431                             else if( 0x0a == *pStt )
 432                                 ++pStt;
 433                         }
 434
 435                         // We skip the last one at the end
 436                         if (bChkSplit && (!m_rInput.eof() || pEnd != pStt))
 437                             bSplitNode = true;
 438                     }
 439                     break;
 440
 441         case 0x0c:
 442                     {
 443                         // Insert a hard page break
 444                         *pStt++ = 0;
 445                         if( nLineLen )
 446                         {
 447                             InsertText( OUString( pLastStt ));
 448                         }
 449                         m_rDoc.getIDocumentContentOperations().SplitNode(*m_oPam->GetPoint(),
 450                                                                          false);
 451                         m_rDoc.getIDocumentContentOperations().InsertPoolItem(
 452                             *m_oPam, SvxFormatBreakItem(SvxBreak::PageBefore, RES_BREAK));
 453                         pLastStt = pStt;
 454                         nLineLen = 0;
 455                         bIns = false;
 456                     }
 457                     break;
 458
 459         case 0x1a:
 460             if (nReadCnt == m_nFileSize && pStt + 1 == pEnd)
 461                 *pStt = 0;
 462             else
 463                 *pStt = '#'; // Replacement visualisation
 464             break;
 465
 466         case '\t':  break;
 467
 468         default:
 469             if( ' ' > *pStt )
 470             // Found control char, replace with '#'
 471                 *pStt = '#';
 472             break;
 473         }
 474
 475         if( bIns )
 476         {
 477             if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
 478                 ( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
 479             {
 480                 sal_Unicode c = *pStt;
 481                 *pStt = 0;
 482                 InsertText( OUString( pLastStt ));
 483                 m_rDoc.getIDocumentContentOperations().SplitNode(*m_oPam->GetPoint(), false);
 484                 pLastStt = pStt;
 485                 nLineLen = 0;
 486                 *pStt = c;
 487             }
 488             ++pStt;
 489             ++nLineLen;
 490         }
 491         else if( bSplitNode )
 492         {
 493             // We found a CR/LF, thus save the text
 494             InsertText( OUString( pLastStt ));
 495             if (m_bNewDoc)
 496                 m_rDoc.getIDocumentContentOperations().AppendTextNode(*m_oPam->GetPoint());
 497             else
 498                 m_rDoc.getIDocumentContentOperations().SplitNode(*m_oPam->GetPoint(), false);
 499             pLastStt = pStt;
 500             nLineLen = 0;
 501         }
 502     } while(true);
 503
 504     if( hConverter )
 505     {
 506         rtl_destroyTextToUnicodeContext( hConverter, hContext );
 507         rtl_destroyTextToUnicodeConverter( hConverter );
 508     }
 509     return ERRCODE_NONE;
 510 }
 511
 512 void SwASCIIParser::InsertText( const OUString& rStr )
 513 {
 514     m_rDoc.getIDocumentContentOperations().InsertString(*m_oPam, rStr);
 515
 516     if (m_oItemSet && g_pBreakIt
 517         && m_nScript != (SvtScriptType::LATIN | SvtScriptType::ASIAN | SvtScriptType::COMPLEX))
 518         m_nScript |= g_pBreakIt->GetAllScriptsOfText(rStr);
 519 }
 520
 521 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */