Update git submodules
[LibreOffice.git] / helpcompiler / source / HelpIndexer.cxx
blob65e46743b482ed3cbaf5c92944a96d2de1e00178
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
10 #include <helpcompiler/HelpIndexer.hxx>
12 #include <rtl/string.hxx>
13 #include <rtl/uri.hxx>
14 #include <o3tl/runtimetooustring.hxx>
15 #include <osl/file.hxx>
16 #include <osl/thread.h>
17 #include <o3tl/string_view.hxx>
18 #include <memory>
19 #include <utility>
21 #include "LuceneHelper.hxx"
22 #include <CLucene.h>
23 #include <CLucene/analysis/LanguageBasedAnalyzer.h>
25 #if defined _WIN32
26 #include <o3tl/char16_t2wchar_t.hxx>
27 #include <prewin.h>
28 #include <postwin.h>
29 #endif
31 using namespace lucene::document;
33 HelpIndexer::HelpIndexer(OUString lang, OUString module,
34 std::u16string_view srcDir, std::u16string_view outDir)
35 : d_lang(std::move(lang)), d_module(std::move(module))
37 d_indexDir = outDir + OUStringChar('/') + d_module + ".idxl";
38 osl_getAbsoluteFileURL(nullptr, d_indexDir.pData, &d_indexDir.pData);
39 d_captionDir = OUString::Concat(srcDir) + "/caption";
40 osl_getAbsoluteFileURL(nullptr, d_captionDir.pData, &d_captionDir.pData);
41 d_contentDir = OUString::Concat(srcDir) + "/content";
42 osl_getAbsoluteFileURL(nullptr, d_contentDir.pData, &d_contentDir.pData);
#if defined _WIN32
namespace
{
// Invokes constructor with an 8-bit rendering of ustrPath and returns its result.
//
// The first attempt uses the path converted to the thread text encoding. If that
// attempt throws CLuceneError, the short (8.3) form of the path is obtained via
// GetShortPathNameW and the constructor is tried once more with it. When the
// short name cannot be obtained, the original CLuceneError is rethrown; an
// exception from the second attempt propagates unchanged.
template <class Constructor>
auto TryWithUnicodePathWorkaround(const OUString& ustrPath, const Constructor& constructor)
{
    const rtl_TextEncoding encoding = osl_getThreadTextEncoding();
    const OString narrowPath = OUStringToOString(ustrPath, encoding);
    try
    {
        // First attempt: the path in thread encoding (ACP in case of Windows).
        return constructor(narrowPath);
    }
    catch (const CLuceneError&)
    {
        // The path may contain characters that ACP cannot represent. Lucene has no API
        // taking Unicode strings (it takes 8-bit strings and hands them to CRT library
        // functions without conversion).
        //
        // As a workaround, fall back to the short name, which should only contain ASCII
        // characters. This does not help (the original long name comes back) when short
        // (8.3) file name creation is disabled in OS or volume settings.
        wchar_t shortName[32767];
        const auto nWritten
            = GetShortPathNameW(o3tl::toW(ustrPath.getStr()), shortName, std::size(shortName));
        if (nWritten == 0)
            throw; // no short name available: surface the original failure

        const OString narrowShortPath = OUStringToOString(o3tl::toU(shortName), encoding);
        return constructor(narrowShortPath);
    }
}
}
#endif
77 bool HelpIndexer::indexDocuments()
79 if (!scanForFiles())
80 return false;
82 try
84 std::u16string_view sLang = o3tl::getToken(d_lang, 0, '-');
85 bool bUseCJK = sLang == u"ja" || sLang == u"ko" || sLang == u"zh";
87 // Construct the analyzer appropriate for the given language
88 std::unique_ptr<lucene::analysis::Analyzer> analyzer;
89 if (bUseCJK)
90 analyzer.reset(new lucene::analysis::LanguageBasedAnalyzer(L"cjk"));
91 else
92 analyzer.reset(new lucene::analysis::standard::StandardAnalyzer());
94 OUString ustrSystemPath;
95 osl::File::getSystemPathFromFileURL(d_indexDir, ustrSystemPath);
97 #if defined _WIN32
98 // Make sure the path exists, or GetShortPathNameW (if attempted) will fail.
99 osl::Directory::createPath(d_indexDir);
100 auto writer = TryWithUnicodePathWorkaround(ustrSystemPath, [&analyzer](const OString& s) {
101 return std::make_unique<lucene::index::IndexWriter>(s.getStr(), analyzer.get(), true);
103 #else
104 OString indexDirStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
105 auto writer = std::make_unique<lucene::index::IndexWriter>(indexDirStr.getStr(),
106 analyzer.get(), true);
107 #endif
109 #ifndef SYSTEM_CLUCENE
110 // avoid random values in index file, making help indices reproducible
111 writer->setSegmentInfoStartVersion(0);
112 #endif
114 //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
115 //exception for ja help. Could alternative ignore the exception and get
116 //truncated results as per java-Lucene apparently
117 writer->setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH*2);
119 // Index the identified help files
120 Document doc;
121 for (auto const& elem : d_files)
123 helpDocument(elem, &doc);
124 writer->addDocument(&doc);
125 doc.clear();
128 // Optimize the index
129 writer->optimize();
131 catch (CLuceneError &e)
133 d_error = o3tl::runtimeToOUString(e.what());
134 return false;
137 return true;
141 bool HelpIndexer::scanForFiles() {
142 if (!scanForFiles(d_contentDir)) {
143 return false;
145 if (!scanForFiles(d_captionDir)) {
146 return false;
148 return true;
151 bool HelpIndexer::scanForFiles(OUString const & path) {
153 osl::Directory dir(path);
154 if (osl::FileBase::E_None != dir.open()) {
155 d_error = "Error reading directory " + path;
156 return false;
159 osl::DirectoryItem item;
160 osl::FileStatus fileStatus(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Type);
161 while (dir.getNextItem(item) == osl::FileBase::E_None) {
162 item.getFileStatus(fileStatus);
163 if (fileStatus.getFileType() == osl::FileStatus::Regular) {
164 d_files.insert(fileStatus.getFileName());
168 return true;
171 void HelpIndexer::helpDocument(OUString const & fileName, Document *doc) const {
172 // Add the help path as an indexed, untokenized field.
174 OUString path = "#HLP#" + d_module + "/" + fileName;
175 std::vector<TCHAR> aPath(OUStringToTCHARVec(path));
176 doc->add(*_CLNEW Field(_T("path"), aPath.data(), int(Field::STORE_YES) | int(Field::INDEX_UNTOKENIZED)));
178 OUString sEscapedFileName =
179 rtl::Uri::encode(fileName,
180 rtl_UriCharClassUric, rtl_UriEncodeIgnoreEscapes, RTL_TEXTENCODING_UTF8);
182 // Add the caption as a field.
183 OUString captionPath = d_captionDir + "/" + sEscapedFileName;
184 doc->add(*_CLNEW Field(_T("caption"), helpFileReader(captionPath), int(Field::STORE_NO) | int(Field::INDEX_TOKENIZED)));
186 // Add the content as a field.
187 OUString contentPath = d_contentDir + "/" + sEscapedFileName;
188 doc->add(*_CLNEW Field(_T("content"), helpFileReader(contentPath), int(Field::STORE_NO) | int(Field::INDEX_TOKENIZED)));
191 lucene::util::Reader *HelpIndexer::helpFileReader(OUString const & path) {
192 osl::File file(path);
193 if (osl::FileBase::E_None == file.open(osl_File_OpenFlag_Read)) {
194 file.close();
195 OUString ustrSystemPath;
196 osl::File::getSystemPathFromFileURL(path, ustrSystemPath);
197 #if defined _WIN32
198 return TryWithUnicodePathWorkaround(ustrSystemPath, [](const OString& s) {
199 return _CLNEW lucene::util::FileReader(s.getStr(), "UTF-8");
201 #else
202 OString pathStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
203 return _CLNEW lucene::util::FileReader(pathStr.getStr(), "UTF-8");
204 #endif
205 } else {
206 return _CLNEW lucene::util::StringReader(L"");
210 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */