update credits
[LibreOffice.git] / helpcompiler / source / HelpIndexer.cxx
blobb3508a7f88828142ad1951d5b340b748b0674159
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
10 #include <helpcompiler/HelpIndexer.hxx>
12 #include <rtl/string.hxx>
13 #include <rtl/uri.hxx>
14 #include <rtl/ustrbuf.hxx>
15 #include <osl/file.hxx>
16 #include <osl/thread.h>
17 #include <boost/scoped_ptr.hpp>
18 #include <algorithm>
20 #include "LuceneHelper.hxx"
22 using namespace lucene::document;
24 HelpIndexer::HelpIndexer(OUString const &lang, OUString const &module,
25 OUString const &srcDir, OUString const &outDir)
26 : d_lang(lang), d_module(module)
28 d_indexDir = OUStringBuffer(outDir).append('/').
29 append(module).appendAscii(RTL_CONSTASCII_STRINGPARAM(".idxl")).toString();
30 d_captionDir = srcDir + OUString("/caption");
31 d_contentDir = srcDir + OUString("/content");
34 bool HelpIndexer::indexDocuments()
36 if (!scanForFiles())
37 return false;
39 try
41 OUString sLang = d_lang.getToken(0, '-');
42 bool bUseCJK = sLang == "ja" || sLang == "ko" || sLang == "zh";
44 // Construct the analyzer appropriate for the given language
45 boost::scoped_ptr<lucene::analysis::Analyzer> analyzer;
46 if (bUseCJK)
47 analyzer.reset(new lucene::analysis::LanguageBasedAnalyzer(L"cjk"));
48 else
49 analyzer.reset(new lucene::analysis::standard::StandardAnalyzer());
51 OUString ustrSystemPath;
52 osl::File::getSystemPathFromFileURL(d_indexDir, ustrSystemPath);
54 OString indexDirStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
55 lucene::index::IndexWriter writer(indexDirStr.getStr(), analyzer.get(), true);
56 //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
57 //exception for ja help. Could alternative ignore the exception and get
58 //truncated results as per java-Lucene apparently
59 writer.setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH*2);
61 // Index the identified help files
62 Document doc;
63 for (std::set<OUString>::iterator i = d_files.begin(); i != d_files.end(); ++i) {
64 helpDocument(*i, &doc);
65 writer.addDocument(&doc);
66 doc.clear();
68 writer.optimize();
70 // Optimize the index
71 writer.optimize();
73 catch (CLuceneError &e)
75 d_error = OUString::createFromAscii(e.what());
76 return false;
79 return true;
82 OUString const & HelpIndexer::getErrorMessage() {
83 return d_error;
86 bool HelpIndexer::scanForFiles() {
87 if (!scanForFiles(d_contentDir)) {
88 return false;
90 if (!scanForFiles(d_captionDir)) {
91 return false;
93 return true;
96 bool HelpIndexer::scanForFiles(OUString const & path) {
98 osl::Directory dir(path);
99 if (osl::FileBase::E_None != dir.open()) {
100 d_error = OUString("Error reading directory ") + path;
101 return true;
104 osl::DirectoryItem item;
105 osl::FileStatus fileStatus(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Type);
106 while (dir.getNextItem(item) == osl::FileBase::E_None) {
107 item.getFileStatus(fileStatus);
108 if (fileStatus.getFileType() == osl::FileStatus::Regular) {
109 d_files.insert(fileStatus.getFileName());
113 return true;
116 bool HelpIndexer::helpDocument(OUString const & fileName, Document *doc) {
117 // Add the help path as an indexed, untokenized field.
119 OUString path = OUString("#HLP#") +
120 d_module + OUString("/") + fileName;
121 std::vector<TCHAR> aPath(OUStringToTCHARVec(path));
122 doc->add(*_CLNEW Field(_T("path"), &aPath[0], Field::STORE_YES | Field::INDEX_UNTOKENIZED));
124 OUString sEscapedFileName =
125 rtl::Uri::encode(fileName,
126 rtl_UriCharClassUric, rtl_UriEncodeIgnoreEscapes, RTL_TEXTENCODING_UTF8);
128 // Add the caption as a field.
129 OUString captionPath = d_captionDir + OUString("/") + sEscapedFileName;
130 doc->add(*_CLNEW Field(_T("caption"), helpFileReader(captionPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
132 // Add the content as a field.
133 OUString contentPath = d_contentDir + OUString("/") + sEscapedFileName;
134 doc->add(*_CLNEW Field(_T("content"), helpFileReader(contentPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
136 return true;
139 lucene::util::Reader *HelpIndexer::helpFileReader(OUString const & path) {
140 osl::File file(path);
141 if (osl::FileBase::E_None == file.open(osl_File_OpenFlag_Read)) {
142 file.close();
143 OUString ustrSystemPath;
144 osl::File::getSystemPathFromFileURL(path, ustrSystemPath);
145 OString pathStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
146 return _CLNEW lucene::util::FileReader(pathStr.getStr(), "UTF-8");
147 } else {
148 return _CLNEW lucene::util::StringReader(L"");
152 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */