sw/source/filter/ascii/parasc.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <boost/scoped_array.hpp>
  21 #include <tools/stream.hxx>
  22 #include <hintids.hxx>
  23 #include <rtl/tencinfo.h>
  24 #include <sfx2/printer.hxx>
  25 #include <editeng/fontitem.hxx>
  26 #include <editeng/langitem.hxx>
  27 #include <editeng/formatbreakitem.hxx>
  28 #include <editeng/scripttypeitem.hxx>
  29 #include <shellio.hxx>
  30 #include <doc.hxx>
  31 #include <swtypes.hxx>
  32 #include <ndtxt.hxx>
  33 #include <pam.hxx>
  34 #include <frmatr.hxx>
  35 #include <fltini.hxx>
  36 #include <pagedesc.hxx>
  37 #include <breakit.hxx>
  38 #include <swerror.h>
  39 #include <statstr.hrc>          // ResId for the status bar
  40 #include <mdiexp.hxx>           // ...Percent()
  41 #include <poolfmt.hxx>
  42
  43 #include "vcl/metric.hxx"
  44
  45 #define ASC_BUFFLEN 4096
  46
  47 class SwASCIIParser
  48 {
  49     SwDoc* pDoc;
  50     SwPaM* pPam;
  51     SvStream& rInput;
  52     sal_Char* pArr;
  53     const SwAsciiOptions& rOpt;
  54     SfxItemSet* pItemSet;
  55     long nFileSize;
  56     sal_uInt16 nScript;
  57     bool bNewDoc;
  58
  59     sal_uLong ReadChars();
  60     void InsertText( const String& rStr );
  61
  62 public:
  63     SwASCIIParser( SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
  64                             int bReadNewDoc, const SwAsciiOptions& rOpts );
  65     ~SwASCIIParser();
  66
  67     sal_uLong CallParser();
  68 };
  69
  70
  71 // Call for the general reader interface
  72 sal_uLong AsciiReader::Read( SwDoc &rDoc, const String&, SwPaM &rPam, const String & )
  73 {
  74     if( !pStrm )
  75     {
  76         OSL_ENSURE( !this, "ASCII read without a stream" );
  77         return ERR_SWG_READ_ERROR;
  78     }
  79
  80     SwASCIIParser* pParser = new SwASCIIParser( &rDoc, rPam, *pStrm,
  81                                         !bInsertMode, aOpt.GetASCIIOpts() );
  82     sal_uLong nRet = pParser->CallParser();
  83
  84     delete pParser;
  85     // after Read reset the options
  86     aOpt.ResetASCIIOpts();
  87     return nRet;
  88 }
  89
  90 SwASCIIParser::SwASCIIParser(SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
  91     int bReadNewDoc, const SwAsciiOptions& rOpts)
  92     : pDoc(pD), rInput(rIn), rOpt(rOpts), nFileSize(0), nScript(0)
  93     , bNewDoc(bReadNewDoc)
  94 {
  95     pPam = new SwPaM( *rCrsr.GetPoint() );
  96     pArr = new sal_Char [ ASC_BUFFLEN + 2 ];
  97
  98     pItemSet = new SfxItemSet( pDoc->GetAttrPool(),
  99                 RES_CHRATR_FONT,        RES_CHRATR_LANGUAGE,
 100                 RES_CHRATR_CJK_FONT,    RES_CHRATR_CJK_LANGUAGE,
 101                 RES_CHRATR_CTL_FONT,    RES_CHRATR_CTL_LANGUAGE,
 102                 0 );
 103
 104     // set defaults from the options
 105     if( rOpt.GetLanguage() )
 106     {
 107         SvxLanguageItem aLang( (LanguageType)rOpt.GetLanguage(),
 108                                  RES_CHRATR_LANGUAGE );
 109         pItemSet->Put( aLang );
 110         pItemSet->Put( aLang, RES_CHRATR_CJK_LANGUAGE );
 111         pItemSet->Put( aLang, RES_CHRATR_CTL_LANGUAGE );
 112     }
 113     if( rOpt.GetFontName().Len() )
 114     {
 115         Font aTextFont( rOpt.GetFontName(), Size( 0, 10 ) );
 116         if( pDoc->getPrinter( false ) )
 117             aTextFont = pDoc->getPrinter( false )->GetFontMetric( aTextFont );
 118         SvxFontItem aFont( aTextFont.GetFamily(), aTextFont.GetName(),
 119                            aEmptyStr, aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
 120         pItemSet->Put( aFont );
 121         pItemSet->Put( aFont, RES_CHRATR_CJK_FONT );
 122         pItemSet->Put( aFont, RES_CHRATR_CTL_FONT );
 123     }
 124 }
 125
 126 SwASCIIParser::~SwASCIIParser()
 127 {
 128     delete pPam;
 129     delete [] pArr;
 130     delete pItemSet;
 131 }
 132
 133
 134 // Calling the parser
 135 sal_uLong SwASCIIParser::CallParser()
 136 {
 137     rInput.Seek(STREAM_SEEK_TO_END);
 138     rInput.ResetError();
 139
 140     nFileSize = rInput.Tell();
 141     rInput.Seek(STREAM_SEEK_TO_BEGIN);
 142     rInput.ResetError();
 143
 144     ::StartProgress( STR_STATSTR_W4WREAD, 0, nFileSize, pDoc->GetDocShell() );
 145
 146     SwPaM* pInsPam = 0;
 147     xub_StrLen nSttCntnt = 0;
 148     if (!bNewDoc)
 149     {
 150         const SwNodeIndex& rTmp = pPam->GetPoint()->nNode;
 151         pInsPam = new SwPaM( rTmp, rTmp, 0, -1 );
 152         nSttCntnt = pPam->GetPoint()->nContent.GetIndex();
 153     }
 154
 155     SwTxtFmtColl *pColl = 0;
 156
 157     if (bNewDoc)
 158     {
 159         pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_HTML_PRE, false);
 160         if (!pColl)
 161             pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_STANDARD,false);
 162         if (pColl)
 163             pDoc->SetTxtFmtColl(*pPam, pColl);
 164     }
 165
 166     sal_uLong nError = ReadChars();
 167
 168     if( pItemSet )
 169     {
 170         // set only the attribute, for scanned scripts.
 171         if( !( SCRIPTTYPE_LATIN & nScript ))
 172         {
 173             pItemSet->ClearItem( RES_CHRATR_FONT );
 174             pItemSet->ClearItem( RES_CHRATR_LANGUAGE );
 175         }
 176         if( !( SCRIPTTYPE_ASIAN & nScript ))
 177         {
 178             pItemSet->ClearItem( RES_CHRATR_CJK_FONT );
 179             pItemSet->ClearItem( RES_CHRATR_CJK_LANGUAGE );
 180         }
 181         if( !( SCRIPTTYPE_COMPLEX & nScript ))
 182         {
 183             pItemSet->ClearItem( RES_CHRATR_CTL_FONT );
 184             pItemSet->ClearItem( RES_CHRATR_CTL_LANGUAGE );
 185         }
 186         if( pItemSet->Count() )
 187         {
 188             if( bNewDoc )
 189             {
 190                 if (pColl)
 191                 {
 192                     // Using the pool defaults for the font causes significant
 193                     // trouble for the HTML filter, because it is not able
 194                     // to export the pool defaults (or to be more precise:
 195                     // the HTML filter is not able to detect whether a pool
 196                     // default has changed or not. Even a comparison with the
 197                     // HTMLi template does not work, because the defaults are
 198                     // not copied when a new doc is created. The result of
 199                     // comparing pool defaults therefor would be that the
 200                     // defaults are exported always if the have changed for
 201                     // text documents in general. That's not sensible, as well
 202                     // as it is not sensible to export them always.
 203                     sal_uInt16 aWhichIds[4] =
 204                     {
 205                         RES_CHRATR_FONT, RES_CHRATR_CJK_FONT,
 206                         RES_CHRATR_CTL_FONT, 0
 207                     };
 208                     sal_uInt16 *pWhichIds = aWhichIds;
 209                     while (*pWhichIds)
 210                     {
 211                         const SfxPoolItem *pItem;
 212                         if (SFX_ITEM_SET == pItemSet->GetItemState(*pWhichIds,
 213                             false, &pItem))
 214                         {
 215                             pColl->SetFmtAttr( *pItem );
 216                             pItemSet->ClearItem( *pWhichIds );
 217                         }
 218                         ++pWhichIds;
 219                     }
 220                 }
 221                 if (pItemSet->Count())
 222                     pDoc->SetDefault(*pItemSet);
 223             }
 224             else if( pInsPam )
 225             {
 226                 // then set over the insert range the defined attributes
 227                 *pInsPam->GetMark() = *pPam->GetPoint();
 228                 pInsPam->GetPoint()->nNode++;
 229                 pInsPam->GetPoint()->nContent.Assign(
 230                                     pInsPam->GetCntntNode(), nSttCntnt );
 231
 232                 // !!!!!
 233                 OSL_ENSURE( !this, "Have to change - hard attr. to para. style" );
 234                 pDoc->InsertItemSet( *pInsPam, *pItemSet, 0 );
 235             }
 236         }
 237         delete pItemSet, pItemSet = 0;
 238     }
 239
 240     delete pInsPam;
 241
 242     ::EndProgress( pDoc->GetDocShell() );
 243     return nError;
 244 }
 245
 246 sal_uLong SwASCIIParser::ReadChars()
 247 {
 248     sal_Unicode *pStt = 0, *pEnd = 0, *pLastStt = 0;
 249     long nReadCnt = 0, nLineLen = 0;
 250     sal_Unicode cLastCR = 0;
 251     bool bSwapUnicode = false;
 252
 253     const SwAsciiOptions *pUseMe=&rOpt;
 254     SwAsciiOptions aEmpty;
 255     if (nFileSize >= 2 &&
 256         aEmpty.GetFontName() == rOpt.GetFontName() &&
 257         aEmpty.GetCharSet() == rOpt.GetCharSet() &&
 258         aEmpty.GetLanguage() == rOpt.GetLanguage() &&
 259         aEmpty.GetParaFlags() == rOpt.GetParaFlags())
 260     {
 261         sal_uLong nLen, nOrig;
 262         nOrig = nLen = rInput.Read(pArr, ASC_BUFFLEN);
 263         CharSet eCharSet;
 264         bool bRet = SwIoSystem::IsDetectableText(pArr, nLen, &eCharSet, &bSwapUnicode);
 265         OSL_ENSURE(bRet, "Autodetect of text import without nag dialog must "
 266             "have failed");
 267         if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
 268         {
 269             aEmpty.SetCharSet(eCharSet);
 270             rInput.SeekRel(-(long(nLen)));
 271         }
 272         else
 273             rInput.SeekRel(-(long(nOrig)));
 274         pUseMe=&aEmpty;
 275     }
 276
 277     rtl_TextToUnicodeConverter hConverter=0;
 278     rtl_TextToUnicodeContext hContext=0;
 279     CharSet currentCharSet = pUseMe->GetCharSet();
 280     if (RTL_TEXTENCODING_UCS2 != currentCharSet)
 281     {
 282         if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
 283                 currentCharSet = RTL_TEXTENCODING_ASCII_US;
 284         hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
 285         OSL_ENSURE( hConverter, "no string convert available" );
 286         if (!hConverter)
 287             return ERROR_SW_READ_BASE;
 288         bSwapUnicode = false;
 289         hContext = rtl_createTextToUnicodeContext( hConverter );
 290     }
 291     else if (pUseMe != &aEmpty)  //Already successfully figured out type
 292     {
 293         rInput.StartReadingUnicodeText( currentCharSet );
 294         bSwapUnicode = rInput.IsEndianSwap();
 295     }
 296
 297     boost::scoped_array<sal_Unicode> aWork;
 298     sal_uLong nArrOffset = 0;
 299
 300     do {
 301         if( pStt >= pEnd )
 302         {
 303             if( pLastStt != pStt )
 304                 InsertText( OUString( pLastStt ));
 305
 306             // Read a new block
 307             sal_uLong lGCount;
 308             if( SVSTREAM_OK != rInput.GetError() || 0 == (lGCount =
 309                         rInput.Read( pArr + nArrOffset,
 310                                      ASC_BUFFLEN - nArrOffset )))
 311                 break;      // break from the while loop
 312
 313             /*
 314             If there was some unconverted bytes on the last cycle then they
 315             were put at the beginning of the array, so total bytes available
 316             to convert this cycle includes them. If we found 0 following bytes
 317             then we ignore the previous partial character.
 318             */
 319             lGCount+=nArrOffset;
 320
 321             if( hConverter )
 322             {
 323                 sal_uInt32 nInfo;
 324                 sal_Size nNewLen = lGCount, nCntBytes;
 325                 aWork.reset(new sal_Unicode[nNewLen + 1]); // add 1 for '\0'
 326                 sal_Unicode* pBuf = aWork.get();
 327
 328                 nNewLen = rtl_convertTextToUnicode( hConverter, hContext,
 329                                 pArr, lGCount, pBuf, nNewLen,
 330                                 (
 331                                 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
 332                                 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
 333                                 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT |
 334                                 RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
 335                                 ),
 336                                 &nInfo,
 337                                 &nCntBytes );
 338                 if( 0 != ( nArrOffset = lGCount - nCntBytes ) )
 339                     memmove( pArr, pArr + nCntBytes, nArrOffset );
 340
 341                 pStt = pLastStt = aWork.get();
 342                 pEnd = pStt + nNewLen;
 343             }
 344             else
 345             {
 346                 pStt = pLastStt = (sal_Unicode*)pArr;
 347                 pEnd = (sal_Unicode*)(pArr + lGCount);
 348
 349                 if( bSwapUnicode )
 350                 {
 351                     sal_Char* pF = pArr, *pN = pArr + 1;
 352                     for( sal_uLong n = 0; n < lGCount; n += 2, pF += 2, pN += 2 )
 353                     {
 354                         sal_Char c = *pF;
 355                         *pF = *pN;
 356                         *pN = c;
 357                     }
 358                 }
 359             }
 360
 361             *pEnd = 0;
 362             nReadCnt += lGCount;
 363
 364             ::SetProgressState( nReadCnt, pDoc->GetDocShell() );
 365
 366             if( cLastCR )
 367             {
 368                 if( 0x0a == *pStt && 0x0d == cLastCR )
 369                     pLastStt = ++pStt;
 370                 cLastCR = 0;
 371                 nLineLen = 0;
 372                 // We skip the last one at the end
 373                 if( !rInput.IsEof() || !(pEnd == pStt ||
 374                     ( !*pEnd && pEnd == pStt+1 ) ) )
 375                     pDoc->SplitNode( *pPam->GetPoint(), false );
 376             }
 377         }
 378
 379         bool bIns = true, bSplitNode = false;
 380         switch( *pStt )
 381         {
 382
 383         case 0x0a:  if( LINEEND_LF == pUseMe->GetParaFlags() )
 384                     {
 385                         bIns = false;
 386                         *pStt = 0;
 387                         ++pStt;
 388
 389                         // We skip the last one at the end
 390                         if( !rInput.IsEof() || pEnd != pStt )
 391                             bSplitNode = true;
 392                     }
 393                     break;
 394
 395         case 0x0d:  if( LINEEND_LF != pUseMe->GetParaFlags() )
 396                     {
 397                         bIns = false;
 398                         *pStt = 0;
 399                         ++pStt;
 400
 401                         bool bChkSplit = false;
 402                         if( LINEEND_CRLF == pUseMe->GetParaFlags() )
 403                         {
 404                             if( pStt == pEnd )
 405                                 cLastCR = 0x0d;
 406                             else if( 0x0a == *pStt )
 407                             {
 408                                 ++pStt;
 409                                 bChkSplit = true;
 410                             }
 411                         }
 412                         else
 413                             bChkSplit = true;
 414
 415                         // We skip the last one at the end
 416                         if( bChkSplit && ( !rInput.IsEof() || pEnd != pStt ))
 417                             bSplitNode = true;
 418                     }
 419                     break;
 420
 421         case 0x0c:
 422                     {
 423                         // Insert a hard page break
 424                         *pStt++ = 0;
 425                         if( nLineLen )
 426                         {
 427                             InsertText( OUString( pLastStt ));
 428                         }
 429                         pDoc->SplitNode( *pPam->GetPoint(), false );
 430                         pDoc->InsertPoolItem( *pPam, SvxFmtBreakItem(
 431                                     SVX_BREAK_PAGE_BEFORE, RES_BREAK ), 0);
 432                         pLastStt = pStt;
 433                         nLineLen = 0;
 434                         bIns = false;
 435                     }
 436                     break;
 437
 438         case 0x1a:
 439                     if( nReadCnt == nFileSize && pStt+1 == pEnd )
 440                         *pStt = 0;
 441                     else
 442                         *pStt = '#';        // Replacement visualisation
 443                     break;
 444
 445         case '\t':  break;
 446
 447         default:
 448             if( ' ' > *pStt )
 449             // Found control char, replace with '#'
 450                 *pStt = '#';
 451             break;
 452         }
 453
 454         if( bIns )
 455         {
 456             if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
 457                 ( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
 458             {
 459                 sal_Unicode c = *pStt;
 460                 *pStt = 0;
 461                 InsertText( OUString( pLastStt ));
 462                 pDoc->SplitNode( *pPam->GetPoint(), false );
 463                 pLastStt = pStt;
 464                 nLineLen = 0;
 465                 *pStt = c;
 466             }
 467             ++pStt;
 468             ++nLineLen;
 469         }
 470         else if( bSplitNode )
 471         {
 472             // We found a CR/LF, thus save the text
 473             InsertText( OUString( pLastStt ));
 474             pDoc->SplitNode( *pPam->GetPoint(), false );
 475             pLastStt = pStt;
 476             nLineLen = 0;
 477         }
 478     } while(true);
 479
 480     if( hConverter )
 481     {
 482         rtl_destroyTextToUnicodeContext( hConverter, hContext );
 483         rtl_destroyTextToUnicodeConverter( hConverter );
 484     }
 485     return 0;
 486 }
 487
 488 void SwASCIIParser::InsertText( const String& rStr )
 489 {
 490     pDoc->InsertString( *pPam, rStr );
 491     pDoc->UpdateRsid( *pPam, rStr.Len() );
 492     pDoc->UpdateParRsid( pPam->GetPoint()->nNode.GetNode().GetTxtNode() );
 493
 494     if( pItemSet && g_pBreakIt && nScript != ( SCRIPTTYPE_LATIN |
 495                                              SCRIPTTYPE_ASIAN |
 496                                              SCRIPTTYPE_COMPLEX ) )
 497         nScript |= g_pBreakIt->GetAllScriptsOfText( rStr );
 498 }
 499
 500 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */