sw/source/filter/ascii/parasc.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2008 by Sun Microsystems, Inc.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * $RCSfile: parasc.cxx,v $
  10  * $Revision: 1.30 $
  11  *
  12  * This file is part of OpenOffice.org.
  13  *
  14  * OpenOffice.org is free software: you can redistribute it and/or modify
  15  * it under the terms of the GNU Lesser General Public License version 3
  16  * only, as published by the Free Software Foundation.
  17  *
  18  * OpenOffice.org is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21  * GNU Lesser General Public License version 3 for more details
  22  * (a copy is included in the LICENSE file that accompanied this code).
  23  *
  24  * You should have received a copy of the GNU Lesser General Public License
  25  * version 3 along with OpenOffice.org.  If not, see
  26  * <http://www.openoffice.org/license.html>
  27  * for a copy of the LGPLv3 License.
  28  *
  29  ************************************************************************/
  30
  31 // MARKER(update_precomp.py): autogen include statement, do not remove
  32 #include "precompiled_sw.hxx"
  33 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
  34
  35
  36 #include <tools/stream.hxx>
  37 #include <hintids.hxx>
  38 #include <rtl/tencinfo.h>
  39 #include <sfx2/printer.hxx>
  40 #include <svx/fontitem.hxx>
  41 #include <svx/langitem.hxx>
  42 #include <svx/brkitem.hxx>
  43 #include <svx/scripttypeitem.hxx>
  44 #include <shellio.hxx>
  45 #include <doc.hxx>
  46 #include <swtypes.hxx>
  47 #include <ndtxt.hxx>
  48 #include <pam.hxx>
  49 #include <frmatr.hxx>
  50 #include <fltini.hxx>
  51 #include <pagedesc.hxx>
  52 #include <breakit.hxx>
  53 #include <swerror.h>
  54 #ifndef _STATSTR_HRC
  55 #include <statstr.hrc>          // ResId fuer Statusleiste
  56 #endif
  57 #include <mdiexp.hxx>           // ...Percent()
  58 #include <poolfmt.hxx>
  59
  60 #define ASC_BUFFLEN 4096
  61
  62 class SwASCIIParser
  63 {
  64     SwDoc* pDoc;
  65     SwPaM* pPam;
  66     SvStream& rInput;
  67     sal_Char* pArr;
  68     const SwAsciiOptions& rOpt;
  69     SfxItemSet* pItemSet;
  70     long nFileSize;
  71     USHORT nScript;
  72     bool bNewDoc;
  73
  74     ULONG ReadChars();
  75     void InsertText( const String& rStr );
  76
  77 public:
  78     SwASCIIParser( SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
  79                             int bReadNewDoc, const SwAsciiOptions& rOpts );
  80     ~SwASCIIParser();
  81
  82     ULONG CallParser();
  83 };
  84
  85
  86 // Aufruf fuer die allg. Reader-Schnittstelle
  87 ULONG AsciiReader::Read( SwDoc &rDoc, const String&, SwPaM &rPam, const String & )
  88 {
  89     if( !pStrm )
  90     {
  91         ASSERT( !this, "ASCII-Read ohne Stream" );
  92         return ERR_SWG_READ_ERROR;
  93     }
  94
  95     //JP 18.01.96: Alle Ueberschriften sind normalerweise ohne
  96     //              Kapitelnummer. Darum hier explizit abschalten
  97     //              weil das Default jetzt wieder auf AN ist.
  98     if( !bInsertMode )
  99         Reader::SetNoOutlineNum( rDoc );
 100
 101     SwASCIIParser* pParser = new SwASCIIParser( &rDoc, rPam, *pStrm,
 102                                         !bInsertMode, aOpt.GetASCIIOpts() );
 103     ULONG nRet = pParser->CallParser();
 104
 105     delete pParser;
 106     // after Read reset the options
 107     aOpt.ResetASCIIOpts();
 108     return nRet;
 109 }
 110
 111 SwASCIIParser::SwASCIIParser(SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
 112     int bReadNewDoc, const SwAsciiOptions& rOpts)
 113     : pDoc(pD), rInput(rIn), rOpt(rOpts), nScript(0), bNewDoc(bReadNewDoc)
 114 {
 115     pPam = new SwPaM( *rCrsr.GetPoint() );
 116     pArr = new sal_Char [ ASC_BUFFLEN + 2 ];
 117
 118     pItemSet = new SfxItemSet( pDoc->GetAttrPool(),
 119                 RES_CHRATR_FONT,                RES_CHRATR_LANGUAGE,
 120                 RES_CHRATR_CJK_FONT,    RES_CHRATR_CJK_LANGUAGE,
 121                 RES_CHRATR_CTL_FONT,    RES_CHRATR_CTL_LANGUAGE,
 122                 0 );
 123
 124     // set defaults from the options
 125     if( rOpt.GetLanguage() )
 126     {
 127         SvxLanguageItem aLang( (LanguageType)rOpt.GetLanguage(),
 128                                  RES_CHRATR_LANGUAGE );
 129         pItemSet->Put( aLang );
 130         pItemSet->Put( aLang, RES_CHRATR_CJK_LANGUAGE );
 131         pItemSet->Put( aLang, RES_CHRATR_CTL_LANGUAGE );
 132     }
 133     if( rOpt.GetFontName().Len() )
 134     {
 135         bool bDelete = false;
 136         const SfxFont* pFnt = 0;
 137         if( pDoc->getPrinter( false ) )
 138             pFnt = pDoc->getPrinter( false )->GetFontByName( rOpt.GetFontName() );
 139
 140         if( !pFnt )
 141         {
 142             pFnt = new SfxFont( FAMILY_DONTKNOW, rOpt.GetFontName() );
 143             bDelete = true;
 144         }
 145         SvxFontItem aFont( pFnt->GetFamily(), pFnt->GetName(),
 146                         aEmptyStr, pFnt->GetPitch(), pFnt->GetCharSet(), RES_CHRATR_FONT );
 147         pItemSet->Put( aFont );
 148         pItemSet->Put( aFont, RES_CHRATR_CJK_FONT );
 149         pItemSet->Put( aFont, RES_CHRATR_CTL_FONT );
 150
 151         if( bDelete )
 152             delete (SfxFont*)pFnt;
 153     }
 154 }
 155
 156 SwASCIIParser::~SwASCIIParser()
 157 {
 158     delete pPam;
 159     delete [] pArr;
 160     delete pItemSet;
 161 }
 162
 163
 164 // Aufruf des Parsers
 165 ULONG SwASCIIParser::CallParser()
 166 {
 167     rInput.Seek(STREAM_SEEK_TO_END);
 168     rInput.ResetError();
 169
 170     nFileSize = rInput.Tell();
 171     rInput.Seek(STREAM_SEEK_TO_BEGIN);
 172     rInput.ResetError();
 173
 174     ::StartProgress( STR_STATSTR_W4WREAD, 0, nFileSize, pDoc->GetDocShell() );
 175
 176     SwPaM* pInsPam = 0;
 177     xub_StrLen nSttCntnt = 0;
 178     if (!bNewDoc)
 179     {
 180         const SwNodeIndex& rTmp = pPam->GetPoint()->nNode;
 181         pInsPam = new SwPaM( rTmp, rTmp, 0, -1 );
 182         nSttCntnt = pPam->GetPoint()->nContent.GetIndex();
 183     }
 184
 185     SwTxtFmtColl *pColl = 0;
 186
 187     if (bNewDoc)
 188     {
 189         pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_HTML_PRE, false);
 190         if (!pColl)
 191             pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_STANDARD,false);
 192         if (pColl)
 193             pDoc->SetTxtFmtColl(*pPam, pColl);
 194     }
 195
 196     ULONG nError = ReadChars();
 197
 198     if( pItemSet )
 199     {
 200         // set only the attribute, for scanned scripts.
 201         if( !( SCRIPTTYPE_LATIN & nScript ))
 202         {
 203             pItemSet->ClearItem( RES_CHRATR_FONT );
 204             pItemSet->ClearItem( RES_CHRATR_LANGUAGE );
 205         }
 206         if( !( SCRIPTTYPE_ASIAN & nScript ))
 207         {
 208             pItemSet->ClearItem( RES_CHRATR_CJK_FONT );
 209             pItemSet->ClearItem( RES_CHRATR_CJK_LANGUAGE );
 210         }
 211         if( !( SCRIPTTYPE_COMPLEX & nScript ))
 212         {
 213             pItemSet->ClearItem( RES_CHRATR_CTL_FONT );
 214             pItemSet->ClearItem( RES_CHRATR_CTL_LANGUAGE );
 215         }
 216         if( pItemSet->Count() )
 217         {
 218             if( bNewDoc )
 219             {
 220                 if (pColl)
 221                 {
 222                     // Using the pool defaults for the font causes significant
 223                     // trouble for the HTML filter, because it is not able
 224                     // to export the pool defaults (or to be more precice:
 225                     // the HTML filter is not able to detect whether a pool
 226                     // default has changed or not. Even a comparison with the
 227                     // HTMLi template does not work, because the defaults are
 228                     // not copied when a new doc is created. The result of
 229                     // comparing pool defaults therfor would be that the
 230                     // defaults are exported always if the have changed for
 231                     // text documents in general. That's not sensible, as well
 232                     // as it is not sensible to export them always.
 233                     sal_uInt16 aWhichIds[4] =
 234                     {
 235                         RES_CHRATR_FONT, RES_CHRATR_CJK_FONT,
 236                         RES_CHRATR_CTL_FONT, 0
 237                     };
 238                     sal_uInt16 *pWhichIds = aWhichIds;
 239                     while (*pWhichIds)
 240                     {
 241                         const SfxPoolItem *pItem;
 242                         if (SFX_ITEM_SET == pItemSet->GetItemState(*pWhichIds,
 243                             false, &pItem))
 244                         {
 245                             pColl->SetFmtAttr( *pItem );
 246                             pItemSet->ClearItem( *pWhichIds );
 247                         }
 248                         ++pWhichIds;
 249                     }
 250                 }
 251                 if (pItemSet->Count())
 252                     pDoc->SetDefault(*pItemSet);
 253             }
 254             else if( pInsPam )
 255             {
 256                 // then set over the insert range the defined attributes
 257                 *pInsPam->GetMark() = *pPam->GetPoint();
 258                 pInsPam->GetPoint()->nNode++;
 259                 pInsPam->GetPoint()->nContent.Assign(
 260                                     pInsPam->GetCntntNode(), nSttCntnt );
 261
 262                 // !!!!!
 263                 ASSERT( !this, "Have to change - hard attr. to para. style" );
 264                 pDoc->InsertItemSet( *pInsPam, *pItemSet, 0 );
 265             }
 266         }
 267         delete pItemSet, pItemSet = 0;
 268     }
 269
 270     if( pInsPam )
 271         delete pInsPam;
 272
 273     ::EndProgress( pDoc->GetDocShell() );
 274     return nError;
 275 }
 276
 277 ULONG SwASCIIParser::ReadChars()
 278 {
 279     sal_Unicode *pStt = 0, *pEnd = 0, *pLastStt = 0;
 280     long nReadCnt = 0, nLineLen = 0;
 281     sal_Unicode cLastCR = 0;
 282     bool bSwapUnicode = false;
 283
 284     const SwAsciiOptions *pUseMe=&rOpt;
 285     SwAsciiOptions aEmpty;
 286     if (nFileSize >= 2 &&
 287         aEmpty.GetFontName() == rOpt.GetFontName() &&
 288         aEmpty.GetCharSet() == rOpt.GetCharSet() &&
 289         aEmpty.GetLanguage() == rOpt.GetLanguage() &&
 290         aEmpty.GetParaFlags() == rOpt.GetParaFlags())
 291     {
 292         ULONG nLen, nOrig;
 293         nOrig = nLen = rInput.Read(pArr, ASC_BUFFLEN);
 294         CharSet eCharSet;
 295         bool bRet = SwIoSystem::IsDetectableText(pArr, nLen, &eCharSet, &bSwapUnicode);
 296         ASSERT(bRet, "Autodetect of text import without nag dialog must "
 297             "have failed");
 298         if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
 299         {
 300             aEmpty.SetCharSet(eCharSet);
 301             rInput.SeekRel(-(long(nLen)));
 302         }
 303         else
 304             rInput.SeekRel(-(long(nOrig)));
 305         pUseMe=&aEmpty;
 306     }
 307
 308     rtl_TextToUnicodeConverter hConverter=0;
 309     rtl_TextToUnicodeContext hContext=0;
 310     CharSet currentCharSet = pUseMe->GetCharSet();
 311     if (RTL_TEXTENCODING_UCS2 != currentCharSet)
 312     {
 313         if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
 314                 currentCharSet = RTL_TEXTENCODING_ASCII_US;
 315         hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
 316         ASSERT( hConverter, "no string convert avaiable" );
 317         if (!hConverter)
 318             return ERROR_SW_READ_BASE;
 319         bSwapUnicode = false;
 320         hContext = rtl_createTextToUnicodeContext( hConverter );
 321     }
 322     else if (pUseMe != &aEmpty)  //Already successfully figured out type
 323     {
 324         rInput.StartReadingUnicodeText();
 325         bSwapUnicode = rInput.IsEndianSwap();
 326     }
 327
 328     String sWork;
 329     ULONG nArrOffset = 0;
 330
 331     do {
 332         if( pStt >= pEnd )
 333         {
 334             if( pLastStt != pStt )
 335                 InsertText( String( pLastStt ));
 336
 337             // lese einen neuen Block ein
 338             ULONG lGCount;
 339             if( SVSTREAM_OK != rInput.GetError() || 0 == (lGCount =
 340                         rInput.Read( pArr + nArrOffset,
 341                                      ASC_BUFFLEN - nArrOffset )))
 342                 break;          // aus der WHILE-Schleife heraus
 343
 344             /*
 345             #98380#
 346             If there was some unconverted bytes on the last cycle then they
 347             were put at the beginning of the array, so total bytes available
 348             to convert this cycle includes them. If we found 0 following bytes
 349             then we ignore the previous partial character.
 350             */
 351             lGCount+=nArrOffset;
 352
 353             if( hConverter )
 354             {
 355                 sal_uInt32 nInfo;
 356                 sal_Size nNewLen = lGCount, nCntBytes;
 357                 sal_Unicode* pBuf = sWork.AllocBuffer( static_cast< xub_StrLen >(nNewLen) );
 358
 359                 nNewLen = rtl_convertTextToUnicode( hConverter, hContext,
 360                                 pArr, lGCount, pBuf, nNewLen,
 361                                 (
 362                                 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
 363                                 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
 364                                 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT |
 365                                 RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
 366                                 ),
 367                                 &nInfo,
 368                                 &nCntBytes );
 369                 if( 0 != ( nArrOffset = lGCount - nCntBytes ) )
 370                     memmove( pArr, pArr + nCntBytes, nArrOffset );
 371                 sWork.ReleaseBufferAccess( static_cast< xub_StrLen >(nNewLen) );
 372
 373                 pStt = pLastStt = sWork.GetBufferAccess();
 374                 pEnd = pStt + nNewLen;
 375             }
 376             else
 377             {
 378                 pStt = pLastStt = (sal_Unicode*)pArr;
 379                 pEnd = (sal_Unicode*)(pArr + lGCount);
 380
 381                 if( bSwapUnicode )
 382                 {
 383                     sal_Char* pF = pArr, *pN = pArr + 1;
 384                     for( ULONG n = 0; n < lGCount; n += 2, pF += 2, pN += 2 )
 385                     {
 386                         sal_Char c = *pF;
 387                         *pF = *pN;
 388                         *pN = c;
 389                     }
 390                 }
 391             }
 392
 393             *pEnd = 0;
 394             nReadCnt += lGCount;
 395
 396             ::SetProgressState( nReadCnt, pDoc->GetDocShell() );
 397
 398             if( cLastCR )
 399             {
 400                 if( 0x0a == *pStt && 0x0d == cLastCR )
 401                     pLastStt = ++pStt;
 402                 cLastCR = 0;
 403                 nLineLen = 0;
 404                 // JP 03.04.96: das letze am Ende nehmen wir nicht
 405                 if( !rInput.IsEof() || !(pEnd == pStt ||
 406                     ( !*pEnd && pEnd == pStt+1 ) ) )
 407                     pDoc->SplitNode( *pPam->GetPoint(), false );
 408             }
 409         }
 410
 411         bool bIns = true, bSplitNode = false;
 412         switch( *pStt )
 413         {
 414 //JP 12.11.2001: task 94636 - don't ignore all behind the zero character,
 415 //                                                        change it to the default "control character"
 416 //              case 0:
 417 //                                      pEnd = pStt;
 418 //                                      bIns = false ;
 419 //                                      break;
 420
 421         case 0x0a:      if( LINEEND_LF == pUseMe->GetParaFlags() )
 422                     {
 423                         bIns = false;
 424                         *pStt = 0;
 425                         ++pStt;
 426
 427                         // JP 03.04.96: das letze am Ende nehmen wir nicht
 428                         if( !rInput.IsEof() || pEnd != pStt )
 429                             bSplitNode = true;
 430                     }
 431                     break;
 432
 433         case 0x0d:      if( LINEEND_LF != pUseMe->GetParaFlags() )
 434                     {
 435                         bIns = false;
 436                         *pStt = 0;
 437                         ++pStt;
 438
 439                         bool bChkSplit = false;
 440                         if( LINEEND_CRLF == pUseMe->GetParaFlags() )
 441                         {
 442                             if( pStt == pEnd )
 443                                 cLastCR = 0x0d;
 444                             else if( 0x0a == *pStt )
 445                             {
 446                                 ++pStt;
 447                                 bChkSplit = true;
 448                             }
 449                         }
 450                         else
 451                             bChkSplit = true;
 452
 453                             // JP 03.04.96: das letze am Ende nehmen wir nicht
 454                         if( bChkSplit && ( !rInput.IsEof() || pEnd != pStt ))
 455                             bSplitNode = true;
 456                     }
 457                     break;
 458
 459         case 0x0c:
 460                     {
 461                         // dann mal einen harten Seitenumbruch einfuegen
 462                         *pStt++ = 0;
 463                         if( nLineLen )
 464                         {
 465                             // Change to charset system!!!!
 466                             //rOpt.GetCharSet();
 467                             InsertText( String( pLastStt ));
 468                         }
 469                         pDoc->SplitNode( *pPam->GetPoint(), false );
 470                         pDoc->InsertPoolItem( *pPam, SvxFmtBreakItem(
 471                                     SVX_BREAK_PAGE_BEFORE, RES_BREAK ), 0);
 472                         pLastStt = pStt;
 473                         nLineLen = 0;
 474                         bIns = false;
 475                     }
 476                     break;
 477
 478         case 0x1a:
 479                     if( nReadCnt == nFileSize && pStt+1 == pEnd )
 480                         *pStt = 0;
 481                     else
 482                         *pStt = '#';        // Ersatzdarstellung
 483                     break;
 484
 485         case '\t':      break;
 486
 487         default:
 488             if( ' ' > *pStt )
 489                     // Ctrl-Zchn gefunden ersetze durch '#'
 490                 *pStt = '#';
 491             break;
 492         }
 493
 494         if( bIns )
 495         {
 496             if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
 497                 ( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
 498             {
 499                 sal_Unicode c = *pStt;
 500                 *pStt = 0;
 501                 InsertText( String( pLastStt ));
 502                 pDoc->SplitNode( *pPam->GetPoint(), false );
 503                 pLastStt = pStt;
 504                 nLineLen = 0;
 505                 *pStt = c;
 506             }
 507             ++pStt;
 508             ++nLineLen;
 509         }
 510         else if( bSplitNode )
 511         {
 512             // es wurde ein CR/LF erkannt, also speichere den Text
 513
 514             InsertText( String( pLastStt ));
 515             pDoc->SplitNode( *pPam->GetPoint(), false );
 516             pLastStt = pStt;
 517             nLineLen = 0;
 518         }
 519     } while(true);
 520
 521     if( hConverter )
 522     {
 523         rtl_destroyTextToUnicodeContext( hConverter, hContext );
 524         rtl_destroyTextToUnicodeConverter( hConverter );
 525     }
 526     return 0;
 527 }
 528
 529 void SwASCIIParser::InsertText( const String& rStr )
 530 {
 531     pDoc->InsertString( *pPam, rStr );
 532     if( pItemSet && pBreakIt && nScript != ( SCRIPTTYPE_LATIN |
 533                                              SCRIPTTYPE_ASIAN |
 534                                              SCRIPTTYPE_COMPLEX ) )
 535         nScript |= pBreakIt->GetAllScriptsOfText( rStr );
 536 }
 537
 538 /* vi:set tabstop=4 shiftwidth=4 expandtab: */