1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 #include <helpcompiler/HelpIndexer.hxx>
12 #include <rtl/string.hxx>
13 #include <rtl/uri.hxx>
14 #include <o3tl/runtimetooustring.hxx>
15 #include <osl/file.hxx>
16 #include <osl/thread.h>
19 #include "LuceneHelper.hxx"
21 #include <CLucene/analysis/LanguageBasedAnalyzer.h>
23 using namespace lucene::document
;
// Constructor: remembers the language and module name and derives the three
// locations the indexer works with:
//   d_indexDir   = outDir + '/' + module + ".idxl"
//   d_captionDir = srcDir + "/caption"
//   d_contentDir = srcDir + "/content"
// NOTE(review): srcDir/outDir are later handed to osl::File/osl::Directory
// style APIs, so they are presumably file URLs rather than system paths --
// confirm against the callers.
25 HelpIndexer::HelpIndexer(OUString
const &lang
, OUString
const &module
,
26 OUString
const &srcDir
, OUString
const &outDir
)
27 : d_lang(lang
), d_module(module
)
// Index output location for this module.
29 d_indexDir
= outDir
+ OUStringChar('/') + module
+ ".idxl";
// Input location of the caption files.
30 d_captionDir
= srcDir
+ "/caption";
// Input location of the content files.
31 d_contentDir
= srcDir
+ "/content";
// Builds the Lucene index at d_indexDir from the file names collected in
// d_files: picks an analyzer, opens an IndexWriter on the index directory,
// and adds one document per entry of d_files. A CLuceneError is caught and
// its message is stored in d_error.
// NOTE(review): parts of this function are not visible in this view (the
// condition choosing between the two analyzer.reset() calls, the declaration
// of `doc`, the try block opening and the return statements) -- confirm the
// details against the complete file.
34 bool HelpIndexer::indexDocuments()
// Primary language subtag: the part of d_lang before the first '-'.
41 OUString sLang
= d_lang
.getToken(0, '-');
// Japanese, Korean and Chinese are flagged for CJK handling.
42 bool bUseCJK
= sLang
== "ja" || sLang
== "ko" || sLang
== "zh";
44 // Construct the analyzer appropriate for the given language
45 std::unique_ptr
<lucene::analysis::Analyzer
> analyzer
;
// Language-based "cjk" analyzer (presumably chosen when bUseCJK is set) ...
47 analyzer
.reset(new lucene::analysis::LanguageBasedAnalyzer(L
"cjk"));
// ... otherwise the standard analyzer.
49 analyzer
.reset(new lucene::analysis::standard::StandardAnalyzer());
// Convert the index directory from a file URL to a system path; the result
// of the conversion call is not checked here.
51 OUString ustrSystemPath
;
52 osl::File::getSystemPathFromFileURL(d_indexDir
, ustrSystemPath
);
// The writer takes a narrow string, so encode the path with the thread's
// text encoding.
54 OString indexDirStr
= OUStringToOString(ustrSystemPath
, osl_getThreadTextEncoding());
// NOTE(review): the trailing `true` presumably asks CLucene to create the
// index from scratch -- confirm against the IndexWriter constructor docs.
55 lucene::index::IndexWriter
writer(indexDirStr
.getStr(), analyzer
.get(), true);
56 //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
57 //exception for ja help. Could alternatively ignore the exception and get
58 //truncated results as per java-Lucene apparently
59 writer
.setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH
*2);
61 // Index the identified help files
63 for (auto const& elem
: d_files
)
// Fill `doc` with the fields for this file, then hand it to the writer.
65 helpDocument(elem
, &doc
);
66 writer
.addDocument(&doc
);
// Report CLucene failures through d_error.
74 catch (CLuceneError
&e
)
76 d_error
= o3tl::runtimeToOUString(e
.what());
// Scans the content directory and then the caption directory by calling
// scanForFiles(path) on each, testing each call for failure.
// NOTE(review): the bodies of the two failure branches and the final return
// are not visible in this view -- presumably a failed scan makes this
// function fail as well; confirm against the complete file.
84 bool HelpIndexer::scanForFiles() {
85 if (!scanForFiles(d_contentDir
)) {
88 if (!scanForFiles(d_captionDir
)) {
// Collects the names of the regular files found directly in `path` into
// d_files. If the directory cannot be opened, d_error is set to
// "Error reading directory " followed by the path.
// NOTE(review): the return statements and closing braces are not visible in
// this view -- confirm the exact return values against the complete file.
94 bool HelpIndexer::scanForFiles(OUString
const & path
) {
96 osl::Directory
dir(path
);
// Anything other than E_None from open() is treated as an error.
97 if (osl::FileBase::E_None
!= dir
.open()) {
98 d_error
= "Error reading directory " + path
;
102 osl::DirectoryItem item
;
// Only the file name and the file type are requested for each entry.
103 osl::FileStatus
fileStatus(osl_FileStatus_Mask_FileName
| osl_FileStatus_Mask_Type
);
// Iterate until getNextItem() stops returning E_None.
104 while (dir
.getNextItem(item
) == osl::FileBase::E_None
) {
// The result of getFileStatus() is not checked here.
105 item
.getFileStatus(fileStatus
);
// Keep regular files only; other entry types are ignored.
106 if (fileStatus
.getFileType() == osl::FileStatus::Regular
) {
107 d_files
.insert(fileStatus
.getFileName());
// Adds the three fields describing one help file to `doc`:
//   "path"    - "#HLP#" + d_module + "/" + fileName, stored and untokenized
//   "caption" - reader over d_captionDir + "/" + <encoded file name>,
//               not stored, tokenized
//   "content" - reader over d_contentDir + "/" + <encoded file name>,
//               not stored, tokenized
// fileName: name of the help file; doc: document receiving the fields.
114 void HelpIndexer::helpDocument(OUString
const & fileName
, Document
*doc
) const {
115 // Add the help path as an indexed, untokenized field.
117 OUString path
= "#HLP#" + d_module
+ "/" + fileName
;
// Convert to the TCHAR buffer that the Field constructor is given.
118 std::vector
<TCHAR
> aPath(OUStringToTCHARVec(path
));
119 doc
->add(*_CLNEW
Field(_T("path"), aPath
.data(), Field::STORE_YES
| Field::INDEX_UNTOKENIZED
));
// URI-encode the file name (UTF-8, leaving existing escapes alone) before
// appending it to the caption/content directory locations.
121 OUString sEscapedFileName
=
122 rtl::Uri::encode(fileName
,
123 rtl_UriCharClassUric
, rtl_UriEncodeIgnoreEscapes
, RTL_TEXTENCODING_UTF8
);
125 // Add the caption as a field.
126 OUString captionPath
= d_captionDir
+ "/" + sEscapedFileName
;
127 doc
->add(*_CLNEW
Field(_T("caption"), helpFileReader(captionPath
), Field::STORE_NO
| Field::INDEX_TOKENIZED
));
129 // Add the content as a field.
130 OUString contentPath
= d_contentDir
+ "/" + sEscapedFileName
;
131 doc
->add(*_CLNEW
Field(_T("content"), helpFileReader(contentPath
), Field::STORE_NO
| Field::INDEX_TOKENIZED
));
// Returns a newly allocated reader for the file at `path`. When the file can
// be opened for reading, the result is a FileReader on the corresponding
// system path, created with the "UTF-8" encoding name; the remaining return
// yields a StringReader over an empty string.
// NOTE(review): the reader is allocated with _CLNEW and returned as a raw
// pointer; who releases it is not visible here -- confirm against the
// callers. The else/closing-brace structure is also not visible in this view.
134 lucene::util::Reader
*HelpIndexer::helpFileReader(OUString
const & path
) {
// Used to test whether the file can be opened for reading.
135 osl::File
file(path
);
136 if (osl::FileBase::E_None
== file
.open(osl_File_OpenFlag_Read
)) {
// FileReader is given a narrow system path, so convert the URL and encode it
// with the thread's text encoding.
138 OUString ustrSystemPath
;
139 osl::File::getSystemPathFromFileURL(path
, ustrSystemPath
);
140 OString pathStr
= OUStringToOString(ustrSystemPath
, osl_getThreadTextEncoding());
141 return _CLNEW
lucene::util::FileReader(pathStr
.getStr(), "UTF-8");
// Fallback: an empty reader.
143 return _CLNEW
lucene::util::StringReader(L
"");
147 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */