1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 #include <helpcompiler/HelpIndexer.hxx>
12 #include <rtl/string.hxx>
13 #include <rtl/uri.hxx>
14 #include <o3tl/runtimetooustring.hxx>
15 #include <osl/file.hxx>
16 #include <osl/thread.h>
17 #include <o3tl/string_view.hxx>
21 #include "LuceneHelper.hxx"
23 #include <CLucene/analysis/LanguageBasedAnalyzer.h>
26 #include <o3tl/char16_t2wchar_t.hxx>
31 using namespace lucene::document
;
// Constructor: keeps the language and module names (moved into d_lang and
// d_module) and derives the three directories this indexer works with:
//   d_indexDir   = outDir + '/' + d_module + ".idxl"
//   d_captionDir = srcDir + "/caption"
//   d_contentDir = srcDir + "/content"
// Each value is then handed to osl_getAbsoluteFileURL with a null base and the
// member itself as both input and output, so the member is overwritten in
// place. The status returned by osl_getAbsoluteFileURL is not inspected.
// NOTE(review): the embedded numbering skips 36, so the opening of the
// constructor body is not visible in this view.
33 HelpIndexer::HelpIndexer(OUString lang
, OUString module
,
34 std::u16string_view srcDir
, std::u16string_view outDir
)
35 : d_lang(std::move(lang
)), d_module(std::move(module
))
37 d_indexDir
= outDir
+ OUStringChar('/') + d_module
+ ".idxl";
38 osl_getAbsoluteFileURL(nullptr, d_indexDir
.pData
, &d_indexDir
.pData
);
39 d_captionDir
= OUString::Concat(srcDir
) + "/caption";
40 osl_getAbsoluteFileURL(nullptr, d_captionDir
.pData
, &d_captionDir
.pData
);
41 d_contentDir
= OUString::Concat(srcDir
) + "/content";
42 osl_getAbsoluteFileURL(nullptr, d_contentDir
.pData
, &d_contentDir
.pData
);
// Helper that invokes `constructor` with an 8-bit version of `ustrPath`.
//   ustrPath    - path to convert; it is passed to GetShortPathNameW in the
//                 fallback, so presumably a native system path (callers below
//                 pass the result of getSystemPathFromFileURL).
//   constructor - callable taking `const OString&`; its result is returned
//                 unchanged (the return type is deduced via `auto`).
// The path is first converted with the thread text encoding. The visible
// handler for CLuceneError retries with the name produced by
// GetShortPathNameW, converted with the same encoding.
// NOTE(review): the embedded numbering skips several lines (e.g. 53-54, 67,
// 69); the `try` keyword, the declaration of `buf`, and the statement executed
// when GetShortPathNameW returns 0 are not visible in this view.
48 template <class Constructor
>
49 auto TryWithUnicodePathWorkaround(const OUString
& ustrPath
, const Constructor
& constructor
)
51 const rtl_TextEncoding eThreadEncoding
= osl_getThreadTextEncoding();
52 OString sPath
= OUStringToOString(ustrPath
, eThreadEncoding
);
55 // First try path in thread encoding (ACP in case of Windows).
56 return constructor(sPath
);
58 catch (const CLuceneError
&)
60 // Maybe the path contains characters not representable in ACP. There's no API in lucene
61 // that takes Unicode strings (they take 8-bit strings, and pass them to CRT library
62 // functions without conversion).
64 // For a workaround, try short name, which should only contain ASCII characters. Would
65 // not help (i.e., would return original long name) if short (8.3) file name creation is
66 // disabled in OS or volume settings.
// A zero return from GetShortPathNameW is treated as the failure case.
// NOTE(review): what happens on failure (numbered 69) is not visible here.
68 if (GetShortPathNameW(o3tl::toW(ustrPath
.getStr()), buf
, std::size(buf
)) == 0)
70 sPath
= OUStringToOString(o3tl::toU(buf
), eThreadEncoding
);
// Second attempt; a CLuceneError raised here is not caught by this handler.
71 return constructor(sPath
);
// Builds the Lucene index for the files recorded in d_files.
// Visible steps: derive the primary language subtag from d_lang, create an
// analyzer, open an IndexWriter on the system path of d_indexDir (third
// constructor argument `true`), double the maximum field length, and add one
// document per entry of d_files via helpDocument(). A CLuceneError is caught
// and its message stored in d_error.
// NOTE(review): the embedded numbering skips many lines (78-83, 89, 91, 96-97,
// 102-103, 107-108, 115, 117, 120-122, 124-125, 127, 129-135); the return
// statements, the analyzer selection condition, the declaration of `doc` and
// the actual "optimize" call are not visible in this view.
77 bool HelpIndexer::indexDocuments()
// Token 0 of d_lang split on '-', i.e. the part before the first dash.
84 std::u16string_view sLang
= o3tl::getToken(d_lang
, 0, '-');
85 bool bUseCJK
= sLang
== u
"ja" || sLang
== u
"ko" || sLang
== u
"zh";
87 // Construct the analyzer appropriate for the given language
88 std::unique_ptr
<lucene::analysis::Analyzer
> analyzer
;
// NOTE(review): the condition choosing between the two resets below is not
// visible; presumably bUseCJK selects the "cjk" analyzer - confirm.
90 analyzer
.reset(new lucene::analysis::LanguageBasedAnalyzer(L
"cjk"));
92 analyzer
.reset(new lucene::analysis::standard::StandardAnalyzer());
// d_indexDir is converted to a system path; the conversion status is not
// inspected.
94 OUString ustrSystemPath
;
95 osl::File::getSystemPathFromFileURL(d_indexDir
, ustrSystemPath
);
98 // Make sure the path exists, or GetShortPathNameW (if attempted) will fail.
99 osl::Directory::createPath(d_indexDir
);
// The writer borrows the analyzer through analyzer.get(); the unique_ptr keeps
// ownership.
100 auto writer
= TryWithUnicodePathWorkaround(ustrSystemPath
, [&analyzer
](const OString
& s
) {
101 return std::make_unique
<lucene::index::IndexWriter
>(s
.getStr(), analyzer
.get(), true);
// NOTE(review): second declaration of `writer`; presumably the alternative
// branch of a preprocessor conditional that is not visible here - confirm.
104 OString indexDirStr
= OUStringToOString(ustrSystemPath
, osl_getThreadTextEncoding());
105 auto writer
= std::make_unique
<lucene::index::IndexWriter
>(indexDirStr
.getStr(),
106 analyzer
.get(), true);
109 //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
110 //exception for ja help. Could alternatively ignore the exception and get
111 //truncated results as per java-Lucene apparently
112 writer
->setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH
*2);
114 // Index the identified help files
116 for (auto const& elem
: d_files
)
// Fill `doc` for this file name, then hand it to the writer.
118 helpDocument(elem
, &doc
);
119 writer
->addDocument(&doc
);
123 // Optimize the index
// Caught by non-const reference; e.what() is converted and kept in d_error.
126 catch (CLuceneError
&e
)
128 d_error
= o3tl::runtimeToOUString(e
.what());
// Scans both source directories through the single-directory overload:
// d_contentDir first, then d_captionDir.
// NOTE(review): the statements executed when a scan fails and the final return
// (numbered 138-139, 141-144) are not visible in this view.
136 bool HelpIndexer::scanForFiles() {
137 if (!scanForFiles(d_contentDir
)) {
140 if (!scanForFiles(d_captionDir
)) {
// Collects the names of the regular files found directly in `path` into
// d_files.
//   path - directory handed to osl::Directory; on open failure d_error is set
//          to "Error reading directory " followed by this value.
// NOTE(review): the return statements and closing braces (numbered 151-153,
// 160-165) are not visible in this view.
146 bool HelpIndexer::scanForFiles(OUString
const & path
) {
148 osl::Directory
dir(path
);
149 if (osl::FileBase::E_None
!= dir
.open()) {
150 d_error
= "Error reading directory " + path
;
154 osl::DirectoryItem item
;
// Only the file name and the file type are requested for each item.
155 osl::FileStatus
fileStatus(osl_FileStatus_Mask_FileName
| osl_FileStatus_Mask_Type
);
// Iterate until getNextItem reports anything other than E_None.
156 while (dir
.getNextItem(item
) == osl::FileBase::E_None
) {
// The status of getFileStatus itself is not inspected.
157 item
.getFileStatus(fileStatus
);
158 if (fileStatus
.getFileType() == osl::FileStatus::Regular
) {
159 d_files
.insert(fileStatus
.getFileName());
// Populates `doc` with the three fields indexed for one help file.
//   fileName - file name as collected in d_files; used verbatim in the "path"
//              field and URI-encoded for the on-disk lookups.
//   doc      - document that receives the fields.
// Fields added:
//   "path"    - "#HLP#" + d_module + "/" + fileName, stored and untokenized
//   "caption" - reader for d_captionDir + "/" + encoded name, tokenized, not stored
//   "content" - reader for d_contentDir + "/" + encoded name, tokenized, not stored
// NOTE(review): lines numbered 168, 172, 176, 180, 184-185 are not visible in
// this view.
166 void HelpIndexer::helpDocument(OUString
const & fileName
, Document
*doc
) const {
167 // Add the help path as an indexed, untokenized field.
169 OUString path
= "#HLP#" + d_module
+ "/" + fileName
;
170 std::vector
<TCHAR
> aPath(OUStringToTCHARVec(path
));
171 doc
->add(*_CLNEW
Field(_T("path"), aPath
.data(), int(Field::STORE_YES
) | int(Field::INDEX_UNTOKENIZED
)));
// The same encoded name is used for both the caption and the content lookup.
173 OUString sEscapedFileName
=
174 rtl::Uri::encode(fileName
,
175 rtl_UriCharClassUric
, rtl_UriEncodeIgnoreEscapes
, RTL_TEXTENCODING_UTF8
);
177 // Add the caption as a field.
178 OUString captionPath
= d_captionDir
+ "/" + sEscapedFileName
;
179 doc
->add(*_CLNEW
Field(_T("caption"), helpFileReader(captionPath
), int(Field::STORE_NO
) | int(Field::INDEX_TOKENIZED
)));
181 // Add the content as a field.
182 OUString contentPath
= d_contentDir
+ "/" + sEscapedFileName
;
183 doc
->add(*_CLNEW
Field(_T("content"), helpFileReader(contentPath
), int(Field::STORE_NO
) | int(Field::INDEX_TOKENIZED
)));
// Returns a newly allocated (_CLNEW) Lucene reader for the help file at `path`.
//   path - location handed to osl::File and to getSystemPathFromFileURL, so
//          presumably a file URL - confirm against callers.
// When the file can be opened for reading, the visible code converts `path` to
// a system path and creates a FileReader with the "UTF-8" encoding name. A
// StringReader over an empty string is also created further down.
// NOTE(review): lines numbered 189, 192, 195-196, 199-200, 202-204 are not
// visible; presumably the two FileReader variants are alternative branches of
// a preprocessor conditional and the StringReader is the fallback when the
// file cannot be opened - confirm.
186 lucene::util::Reader
*HelpIndexer::helpFileReader(OUString
const & path
) {
187 osl::File
file(path
);
188 if (osl::FileBase::E_None
== file
.open(osl_File_OpenFlag_Read
)) {
// The conversion status is not inspected.
190 OUString ustrSystemPath
;
191 osl::File::getSystemPathFromFileURL(path
, ustrSystemPath
);
193 return TryWithUnicodePathWorkaround(ustrSystemPath
, [](const OString
& s
) {
194 return _CLNEW
lucene::util::FileReader(s
.getStr(), "UTF-8");
// Variant without the short-name retry: plain thread-encoding conversion.
197 OString pathStr
= OUStringToOString(ustrSystemPath
, osl_getThreadTextEncoding());
198 return _CLNEW
lucene::util::FileReader(pathStr
.getStr(), "UTF-8");
201 return _CLNEW
lucene::util::StringReader(L
"");
205 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */