helpcompiler/source/HelpIndexer.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  */
   9
  10 #include <helpcompiler/HelpIndexer.hxx>
  11
  12 #include <rtl/string.hxx>
  13 #include <rtl/uri.hxx>
  14 #include <o3tl/runtimetooustring.hxx>
  15 #include <osl/file.hxx>
  16 #include <osl/thread.h>
  17 #include <memory>
  18
  19 #include "LuceneHelper.hxx"
  20 #include <CLucene.h>
  21 #include <CLucene/analysis/LanguageBasedAnalyzer.h>
  22
  23 #if defined _WIN32
  24 #include <o3tl/char16_t2wchar_t.hxx>
  25 #include <prewin.h>
  26 #include <postwin.h>
  27 #endif
  28
  29 using namespace lucene::document;
  30
  31 HelpIndexer::HelpIndexer(OUString const &lang, OUString const &module,
  32     std::u16string_view srcDir, std::u16string_view outDir)
  33     : d_lang(lang), d_module(module)
  34 {
  35     d_indexDir = outDir + OUStringChar('/') + module + ".idxl";
  36     d_captionDir = OUString::Concat(srcDir) + "/caption";
  37     d_contentDir = OUString::Concat(srcDir) + "/content";
  38 }
  39
  40 #if defined _WIN32
  41 namespace
  42 {
  43 template <class Constructor>
  44 auto TryWithUnicodePathWorkaround(const OUString& ustrPath, const Constructor& constructor)
  45 {
  46     const rtl_TextEncoding eThreadEncoding = osl_getThreadTextEncoding();
  47     OString sPath = OUStringToOString(ustrPath, eThreadEncoding);
  48     try
  49     {
  50         // First try path in thread encoding (ACP in case of Windows).
  51         return constructor(sPath);
  52     }
  53     catch (const CLuceneError&)
  54     {
  55         // Maybe the path contains characters not representable in ACP. There's no API in lucene
  56         // that takes Unicode strings (they take 8-bit strings, and pass them to CRT library
  57         // functions without conversion).
  58
  59         // For a workaround, try short name, which should only contain ASCII characters. Would
  60         // not help (i.e., would return original long name) if short (8.3) file name creation is
  61         // disabled in OS or volume settings.
  62         wchar_t buf[32767];
  63         if (GetShortPathNameW(o3tl::toW(ustrPath.getStr()), buf, std::size(buf)) == 0)
  64             throw;
  65         sPath = OUStringToOString(o3tl::toU(buf), eThreadEncoding);
  66         return constructor(sPath);
  67     }
  68 }
  69 }
  70 #endif
  71
  72 bool HelpIndexer::indexDocuments()
  73 {
  74     if (!scanForFiles())
  75         return false;
  76
  77     try
  78     {
  79         OUString sLang = d_lang.getToken(0, '-');
  80         bool bUseCJK = sLang == "ja" || sLang == "ko" || sLang == "zh";
  81
  82         // Construct the analyzer appropriate for the given language
  83         std::unique_ptr<lucene::analysis::Analyzer> analyzer;
  84         if (bUseCJK)
  85             analyzer.reset(new lucene::analysis::LanguageBasedAnalyzer(L"cjk"));
  86         else
  87             analyzer.reset(new lucene::analysis::standard::StandardAnalyzer());
  88
  89         OUString ustrSystemPath;
  90         osl::File::getSystemPathFromFileURL(d_indexDir, ustrSystemPath);
  91
  92 #if defined _WIN32
  93         // Make sure the path exists, or GetShortPathNameW (if attempted) will fail.
  94         osl::Directory::createPath(d_indexDir);
  95         auto writer = TryWithUnicodePathWorkaround(ustrSystemPath, [&analyzer](const OString& s) {
  96             return std::make_unique<lucene::index::IndexWriter>(s.getStr(), analyzer.get(), true);
  97         });
  98 #else
  99         OString indexDirStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
 100         auto writer = std::make_unique<lucene::index::IndexWriter>(indexDirStr.getStr(),
 101                                                                    analyzer.get(), true);
 102 #endif
 103
 104         //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
 105         //exception for ja help. Could alternative ignore the exception and get
 106         //truncated results as per java-Lucene apparently
 107         writer->setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH*2);
 108
 109         // Index the identified help files
 110         Document doc;
 111         for (auto const& elem : d_files)
 112         {
 113             helpDocument(elem, &doc);
 114             writer->addDocument(&doc);
 115             doc.clear();
 116         }
 117
 118         // Optimize the index
 119         writer->optimize();
 120     }
 121     catch (CLuceneError &e)
 122     {
 123         d_error = o3tl::runtimeToOUString(e.what());
 124         return false;
 125     }
 126
 127     return true;
 128 }
 129
 130
 131 bool HelpIndexer::scanForFiles() {
 132     if (!scanForFiles(d_contentDir)) {
 133         return false;
 134     }
 135     if (!scanForFiles(d_captionDir)) {
 136         return false;
 137     }
 138     return true;
 139 }
 140
 141 bool HelpIndexer::scanForFiles(OUString const & path) {
 142
 143     osl::Directory dir(path);
 144     if (osl::FileBase::E_None != dir.open()) {
 145         d_error = "Error reading directory " + path;
 146         return false;
 147     }
 148
 149     osl::DirectoryItem item;
 150     osl::FileStatus fileStatus(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Type);
 151     while (dir.getNextItem(item) == osl::FileBase::E_None) {
 152         item.getFileStatus(fileStatus);
 153         if (fileStatus.getFileType() == osl::FileStatus::Regular) {
 154             d_files.insert(fileStatus.getFileName());
 155         }
 156     }
 157
 158     return true;
 159 }
 160
 161 void HelpIndexer::helpDocument(OUString const & fileName, Document *doc) const {
 162     // Add the help path as an indexed, untokenized field.
 163
 164     OUString path = "#HLP#" + d_module + "/" + fileName;
 165     std::vector<TCHAR> aPath(OUStringToTCHARVec(path));
 166     doc->add(*_CLNEW Field(_T("path"), aPath.data(), int(Field::STORE_YES) | int(Field::INDEX_UNTOKENIZED)));
 167
 168     OUString sEscapedFileName =
 169         rtl::Uri::encode(fileName,
 170         rtl_UriCharClassUric, rtl_UriEncodeIgnoreEscapes, RTL_TEXTENCODING_UTF8);
 171
 172     // Add the caption as a field.
 173     OUString captionPath = d_captionDir + "/" + sEscapedFileName;
 174     doc->add(*_CLNEW Field(_T("caption"), helpFileReader(captionPath), int(Field::STORE_NO) | int(Field::INDEX_TOKENIZED)));
 175
 176     // Add the content as a field.
 177     OUString contentPath = d_contentDir + "/" + sEscapedFileName;
 178     doc->add(*_CLNEW Field(_T("content"), helpFileReader(contentPath), int(Field::STORE_NO) | int(Field::INDEX_TOKENIZED)));
 179 }
 180
 181 lucene::util::Reader *HelpIndexer::helpFileReader(OUString const & path) {
 182     osl::File file(path);
 183     if (osl::FileBase::E_None == file.open(osl_File_OpenFlag_Read)) {
 184         file.close();
 185         OUString ustrSystemPath;
 186         osl::File::getSystemPathFromFileURL(path, ustrSystemPath);
 187 #if defined _WIN32
 188         return TryWithUnicodePathWorkaround(ustrSystemPath, [](const OString& s) {
 189             return _CLNEW lucene::util::FileReader(s.getStr(), "UTF-8");
 190         });
 191 #else
 192         OString pathStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
 193         return _CLNEW lucene::util::FileReader(pathStr.getStr(), "UTF-8");
 194 #endif
 195     } else {
 196         return _CLNEW lucene::util::StringReader(L"");
 197     }
 198 }
 199
 200 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */