1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 #include <helpcompiler/HelpIndexer.hxx>
12 #include <rtl/string.hxx>
13 #include <rtl/uri.hxx>
14 #include <o3tl/runtimetooustring.hxx>
15 #include <osl/file.hxx>
16 #include <osl/thread.h>
19 #include "LuceneHelper.hxx"
21 #include <CLucene/analysis/LanguageBasedAnalyzer.h>
24 #include <o3tl/char16_t2wchar_t.hxx>
29 using namespace lucene::document
;
31 HelpIndexer::HelpIndexer(OUString
const &lang
, OUString
const &module
,
32 std::u16string_view srcDir
, std::u16string_view outDir
)
33 : d_lang(lang
), d_module(module
)
35 d_indexDir
= outDir
+ OUStringChar('/') + module
+ ".idxl";
36 d_captionDir
= OUString::Concat(srcDir
) + "/caption";
37 d_contentDir
= OUString::Concat(srcDir
) + "/content";
// Calls constructor with an 8-bit rendering of ustrPath and returns whatever constructor returns.
//
// ustrPath    - path to hand over; it is converted with OUStringToOString using the
//               thread text encoding obtained from osl_getThreadTextEncoding().
// constructor - callable invoked as constructor(sPath) with the converted OString.
//
// If a CLuceneError is caught, the path reported by GetShortPathNameW is converted the
// same way and constructor is invoked a second time.
//
// NOTE(review): the embedded line numbers in this listing skip (45, 48-49, 52, 54, 58, 62,
// 64, 67+), so braces, the `try` keyword, the declaration of `buf` and the statement
// guarded by the `== 0` check are not visible here - confirm against the full file.
43 template <class Constructor
>
44 auto TryWithUnicodePathWorkaround(const OUString
& ustrPath
, const Constructor
& constructor
)
46 const rtl_TextEncoding eThreadEncoding
= osl_getThreadTextEncoding();
47 OString sPath
= OUStringToOString(ustrPath
, eThreadEncoding
);
50 // First try path in thread encoding (ACP in case of Windows).
51 return constructor(sPath
);
53 catch (const CLuceneError
&)
55 // Maybe the path contains characters not representable in ACP. There's no API in lucene
56 // that takes Unicode strings (they take 8-bit strings, and pass them to CRT library
57 // functions without conversion).
59 // For a workaround, try short name, which should only contain ASCII characters. Would
60 // not help (i.e., would return original long name) if short (8.3) file name creation is
61 // disabled in OS or volume settings.
// A zero return from GetShortPathNameW is tested here; what happens in that case is on a
// line that is not visible in this listing (presumably the error is propagated - verify).
63 if (GetShortPathNameW(o3tl::toW(ustrPath
.getStr()), buf
, std::size(buf
)) == 0)
// Retry with the short path, converted with the same thread encoding as the first attempt.
65 sPath
= OUStringToOString(o3tl::toU(buf
), eThreadEncoding
);
66 return constructor(sPath
);
// Creates a CLucene index at d_indexDir and adds one document per entry of d_files.
//
// Returns bool. When a CLuceneError is caught, its message is stored in d_error.
//
// NOTE(review): the embedded line numbers in this listing skip (73-78, 81, 84, 86, 88,
// 91-92, 97-98, 102-103, 108, 110, 112, 115-117, 119-120, 122, 124+), so braces, the
// conditions selecting between alternatives, the declaration of `doc`, and the return
// statements are not visible here - confirm against the full file.
72 bool HelpIndexer::indexDocuments()
// Only the part of d_lang before the first '-' decides whether the CJK analyzer is wanted.
79 OUString sLang
= d_lang
.getToken(0, '-');
80 bool bUseCJK
= sLang
== "ja" || sLang
== "ko" || sLang
== "zh";
82 // Construct the analyzer appropriate for the given language
83 std::unique_ptr
<lucene::analysis::Analyzer
> analyzer
;
// Two alternative analyzers follow; the selecting condition is not visible in this listing
// (presumably bUseCJK picks the "cjk" LanguageBasedAnalyzer - verify).
85 analyzer
.reset(new lucene::analysis::LanguageBasedAnalyzer(L
"cjk"));
87 analyzer
.reset(new lucene::analysis::standard::StandardAnalyzer());
// d_indexDir is passed to getSystemPathFromFileURL to obtain the system path for the writer.
89 OUString ustrSystemPath
;
90 osl::File::getSystemPathFromFileURL(d_indexDir
, ustrSystemPath
);
// NOTE(review): `writer` is defined twice below (embedded lines 95 and 100). Presumably these
// are alternatives separated by preprocessor lines that are not visible here - confirm.
// First variant: create the directory, then build the IndexWriter through
// TryWithUnicodePathWorkaround.
93 // Make sure the path exists, or GetShortPathNameW (if attempted) will fail.
94 osl::Directory::createPath(d_indexDir
);
95 auto writer
= TryWithUnicodePathWorkaround(ustrSystemPath
, [&analyzer
](const OString
& s
) {
96 return std::make_unique
<lucene::index::IndexWriter
>(s
.getStr(), analyzer
.get(), true);
// Second variant: convert the system path with the thread text encoding and build the
// IndexWriter directly.
99 OString indexDirStr
= OUStringToOString(ustrSystemPath
, osl_getThreadTextEncoding());
100 auto writer
= std::make_unique
<lucene::index::IndexWriter
>(indexDirStr
.getStr(),
101 analyzer
.get(), true);
104 //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
105 //exception for ja help. Could alternative ignore the exception and get
106 //truncated results as per java-Lucene apparently
107 writer
->setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH
*2);
109 // Index the identified help files
// Each element of d_files is turned into fields of `doc` by helpDocument and then added
// to the writer. The declaration of `doc` is not visible in this listing.
111 for (auto const& elem
: d_files
)
113 helpDocument(elem
, &doc
);
114 writer
->addDocument(&doc
);
118 // Optimize the index
// The CLuceneError message is converted with o3tl::runtimeToOUString and kept in d_error.
121 catch (CLuceneError
&e
)
123 d_error
= o3tl::runtimeToOUString(e
.what());
// Scans d_contentDir and then d_captionDir via the scanForFiles(OUString const &) overload.
//
// NOTE(review): the bodies of both `if` statements and the final return (embedded lines
// 133-134, 136+) are not visible in this listing. Presumably a failed scan makes this
// function return false - confirm against the full file.
131 bool HelpIndexer::scanForFiles() {
132 if (!scanForFiles(d_contentDir
)) {
135 if (!scanForFiles(d_captionDir
)) {
// Collects the names of the regular files found directly in the directory `path` into d_files.
//
// path - directory to enumerate; it is handed to osl::Directory unchanged.
//
// If the directory cannot be opened, d_error is set to "Error reading directory " + path.
//
// NOTE(review): the return statements and closing braces (embedded lines 146-148, 155+)
// are not visible in this listing - confirm the returned values against the full file.
141 bool HelpIndexer::scanForFiles(OUString
const & path
) {
143 osl::Directory
dir(path
);
144 if (osl::FileBase::E_None
!= dir
.open()) {
145 d_error
= "Error reading directory " + path
;
149 osl::DirectoryItem item
;
// Only the file name and the file type are requested from each directory item.
150 osl::FileStatus
fileStatus(osl_FileStatus_Mask_FileName
| osl_FileStatus_Mask_Type
);
// Iterate until getNextItem reports anything other than E_None.
151 while (dir
.getNextItem(item
) == osl::FileBase::E_None
) {
152 item
.getFileStatus(fileStatus
);
// Entries whose type is not Regular are not inserted.
153 if (fileStatus
.getFileType() == osl::FileStatus::Regular
) {
154 d_files
.insert(fileStatus
.getFileName());
161 void HelpIndexer::helpDocument(OUString
const & fileName
, Document
*doc
) const {
162 // Add the help path as an indexed, untokenized field.
164 OUString path
= "#HLP#" + d_module
+ "/" + fileName
;
165 std::vector
<TCHAR
> aPath(OUStringToTCHARVec(path
));
166 doc
->add(*_CLNEW
Field(_T("path"), aPath
.data(), int(Field::STORE_YES
) | int(Field::INDEX_UNTOKENIZED
)));
168 OUString sEscapedFileName
=
169 rtl::Uri::encode(fileName
,
170 rtl_UriCharClassUric
, rtl_UriEncodeIgnoreEscapes
, RTL_TEXTENCODING_UTF8
);
172 // Add the caption as a field.
173 OUString captionPath
= d_captionDir
+ "/" + sEscapedFileName
;
174 doc
->add(*_CLNEW
Field(_T("caption"), helpFileReader(captionPath
), int(Field::STORE_NO
) | int(Field::INDEX_TOKENIZED
)));
176 // Add the content as a field.
177 OUString contentPath
= d_contentDir
+ "/" + sEscapedFileName
;
178 doc
->add(*_CLNEW
Field(_T("content"), helpFileReader(contentPath
), int(Field::STORE_NO
) | int(Field::INDEX_TOKENIZED
)));
// Returns a newly allocated (_CLNEW) CLucene reader for the file at `path`.
//
// path - location of the file; it is opened through osl::File and also passed to
//        osl::File::getSystemPathFromFileURL to obtain the path given to CLucene.
//
// When the file can be opened for reading, a lucene::util::FileReader reading "UTF-8" is
// returned. The last visible statement returns a StringReader over an empty string.
//
// NOTE(review): embedded lines 184, 187, 190-191, 194-195 and 197+ are not visible in this
// listing. The two FileReader returns (188-189 and 192-193) are presumably alternatives
// separated by preprocessor lines, and the StringReader return is presumably the fallback
// when the file cannot be opened - confirm against the full file.
181 lucene::util::Reader
*HelpIndexer::helpFileReader(OUString
const & path
) {
182 osl::File
file(path
);
183 if (osl::FileBase::E_None
== file
.open(osl_File_OpenFlag_Read
)) {
185 OUString ustrSystemPath
;
186 osl::File::getSystemPathFromFileURL(path
, ustrSystemPath
);
// First variant: create the FileReader through TryWithUnicodePathWorkaround.
188 return TryWithUnicodePathWorkaround(ustrSystemPath
, [](const OString
& s
) {
189 return _CLNEW
lucene::util::FileReader(s
.getStr(), "UTF-8");
// Second variant: convert the system path with the thread text encoding and create the
// FileReader directly.
192 OString pathStr
= OUStringToOString(ustrSystemPath
, osl_getThreadTextEncoding());
193 return _CLNEW
lucene::util::FileReader(pathStr
.getStr(), "UTF-8");
// Empty reader.
196 return _CLNEW
lucene::util::StringReader(L
"");
200 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */