1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 #include <helpcompiler/HelpIndexer.hxx>
12 #include <rtl/string.hxx>
13 #include <rtl/uri.hxx>
14 #include <rtl/ustrbuf.hxx>
15 #include <osl/file.hxx>
16 #include <osl/thread.h>
20 #include "LuceneHelper.hxx"
22 using namespace lucene::document
;
24 HelpIndexer::HelpIndexer(OUString
const &lang
, OUString
const &module
,
25 OUString
const &srcDir
, OUString
const &outDir
)
26 : d_lang(lang
), d_module(module
)
28 d_indexDir
= OUStringBuffer(outDir
).append('/').
29 append(module
).append(".idxl").makeStringAndClear();
30 d_captionDir
= srcDir
+ "/caption";
31 d_contentDir
= srcDir
+ "/content";
// Builds the Lucene index for the help files whose names are held in d_files.
//
// Visible steps: derive the primary language subtag from d_lang, create an
// analyzer, convert d_indexDir (treated as a file URL) to a system path in the
// thread text encoding, open an IndexWriter on that path, and add one document
// per entry of d_files. A CLuceneError is caught and its message is stored in
// d_error.
//
// NOTE(review): this listing has gaps (the opening of the function body, the
// start of the try block, the branch between the two reset() calls, the
// declaration of doc, the end of the loop and the return statements are not
// visible). Confirm those parts against the real file before relying on the
// notes below.
34 bool HelpIndexer::indexDocuments()
// Primary language subtag: the text of d_lang before the first dash.
41 OUString sLang
= d_lang
.getToken(0, '-');
// True for Japanese, Korean and Chinese.
42 bool bUseCJK
= sLang
== "ja" || sLang
== "ko" || sLang
== "zh";
44 // Construct the analyzer appropriate for the given language
45 std::unique_ptr
<lucene::analysis::Analyzer
> analyzer
;
// NOTE(review): the condition that selects between the next two reset() calls
// is not visible here; presumably bUseCJK picks the cjk analyzer and every
// other language gets the StandardAnalyzer - confirm.
47 analyzer
.reset(new lucene::analysis::LanguageBasedAnalyzer(L
"cjk"));
49 analyzer
.reset(new lucene::analysis::standard::StandardAnalyzer());
// The writer is given a narrow system path, so d_indexDir is first converted
// from a file URL. The result code of the conversion is not checked.
51 OUString ustrSystemPath
;
52 osl::File::getSystemPathFromFileURL(d_indexDir
, ustrSystemPath
);
54 OString indexDirStr
= OUStringToOString(ustrSystemPath
, osl_getThreadTextEncoding());
// The writer receives a non-owning pointer to the analyzer (analyzer.get()).
// Third argument true: presumably requests a freshly created index - confirm
// against the CLucene IndexWriter documentation.
55 lucene::index::IndexWriter
writer(indexDirStr
.getStr(), analyzer
.get(), true);
56 //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
57 //exception for ja help. Could alternatively ignore the exception and get
58 //truncated results as per java-Lucene apparently
59 writer
.setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH
*2);
61 // Index the identified help files
// NOTE(review): the declaration of doc is not visible in this listing; it is
// passed by address to helpDocument and to addDocument, so it looks like one
// Document object reused across iterations - confirm, including how it is
// reset between files.
63 for (std::set
<OUString
>::iterator i
= d_files
.begin(); i
!= d_files
.end(); ++i
) {
// Fill the document from the help file, then hand it to the writer.
64 helpDocument(*i
, &doc
);
65 writer
.addDocument(&doc
);
// CLucene failures are reported through d_error. What is returned afterwards
// is not visible here.
73 catch (CLuceneError
&e
)
75 d_error
= OUString::createFromAscii(e
.what());
// Scans the content directory and then the caption directory for help files
// by calling scanForFiles(path) on d_contentDir and d_captionDir in turn.
//
// NOTE(review): the statements guarded by the two failure checks and the final
// return are not visible in this listing; presumably a failed scan makes this
// function return false and success of both returns true - confirm.
83 bool HelpIndexer::scanForFiles() {
// Content directory first.
84 if (!scanForFiles(d_contentDir
)) {
// Then the caption directory.
87 if (!scanForFiles(d_captionDir
)) {
// Adds the name of every regular file found directly in directory path to
// d_files.
//
// path is handed to osl::Directory as given. If the directory cannot be
// opened, d_error is set to "Error reading directory " followed by path.
//
// NOTE(review): the return statements are not visible in this listing;
// presumably false after the open failure and true after the loop - confirm.
93 bool HelpIndexer::scanForFiles(OUString
const & path
) {
95 osl::Directory
dir(path
);
// Yoda-style comparison: anything other than E_None counts as failure.
96 if (osl::FileBase::E_None
!= dir
.open()) {
97 d_error
= "Error reading directory " + path
;
101 osl::DirectoryItem item
;
// Only the file name and the file type are requested for each entry.
102 osl::FileStatus
fileStatus(osl_FileStatus_Mask_FileName
| osl_FileStatus_Mask_Type
);
// Iteration stops at the first result other than E_None.
103 while (dir
.getNextItem(item
) == osl::FileBase::E_None
) {
// NOTE(review): the result of getFileStatus is ignored; if it can fail,
// fileStatus may not describe the current item - confirm whether a check is
// wanted here.
104 item
.getFileStatus(fileStatus
);
// Entries that are not regular files are skipped.
105 if (fileStatus
.getFileType() == osl::FileStatus::Regular
) {
// d_files is iterated as a std::set in indexDocuments, so a name that occurs
// in both scanned directories ends up in it once.
106 d_files
.insert(fileStatus
.getFileName());
113 void HelpIndexer::helpDocument(OUString
const & fileName
, Document
*doc
) {
114 // Add the help path as an indexed, untokenized field.
116 OUString path
= "#HLP#" + d_module
+ "/" + fileName
;
117 std::vector
<TCHAR
> aPath(OUStringToTCHARVec(path
));
118 doc
->add(*_CLNEW
Field(_T("path"), &aPath
[0], Field::STORE_YES
| Field::INDEX_UNTOKENIZED
));
120 OUString sEscapedFileName
=
121 rtl::Uri::encode(fileName
,
122 rtl_UriCharClassUric
, rtl_UriEncodeIgnoreEscapes
, RTL_TEXTENCODING_UTF8
);
124 // Add the caption as a field.
125 OUString captionPath
= d_captionDir
+ "/" + sEscapedFileName
;
126 doc
->add(*_CLNEW
Field(_T("caption"), helpFileReader(captionPath
), Field::STORE_NO
| Field::INDEX_TOKENIZED
));
128 // Add the content as a field.
129 OUString contentPath
= d_contentDir
+ "/" + sEscapedFileName
;
130 doc
->add(*_CLNEW
Field(_T("content"), helpFileReader(contentPath
), Field::STORE_NO
| Field::INDEX_TOKENIZED
));
// Returns a newly allocated CLucene reader for the help file at path.
//
// If the file can be opened for reading, the result is a FileReader on the
// corresponding system path, decoding the file as "UTF-8". The other visible
// return yields a StringReader over an empty string.
//
// The reader is allocated with _CLNEW and returned as a raw pointer; the
// caller helpDocument passes it straight into a Field. Presumably the Field
// takes ownership - confirm against the CLucene Field documentation.
//
// NOTE(review): one statement inside the success branch and the separator
// between the two returns are not visible in this listing; presumably the
// second return is the else-path for an unreadable file - confirm.
133 lucene::util::Reader
*HelpIndexer::helpFileReader(OUString
const & path
) {
134 osl::File
file(path
);
// The osl::File is used only to test that the file can be opened; the actual
// reading is done by the FileReader created below.
135 if (osl::FileBase::E_None
== file
.open(osl_File_OpenFlag_Read
)) {
// FileReader is given a narrow system path, so convert path (treated as a
// file URL) using the thread text encoding. The conversion result code is
// not checked.
137 OUString ustrSystemPath
;
138 osl::File::getSystemPathFromFileURL(path
, ustrSystemPath
);
139 OString pathStr
= OUStringToOString(ustrSystemPath
, osl_getThreadTextEncoding());
140 return _CLNEW
lucene::util::FileReader(pathStr
.getStr(), "UTF-8");
// Fallback reader with no text.
142 return _CLNEW
lucene::util::StringReader(L
"");
146 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */