1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * Version: MPL 1.1 / GPLv3+ / LGPLv3+
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License or as specified alternatively below. You may obtain a copy of
8 * the License at http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the License.
15 * Major Contributor(s):
16 * Copyright (C) 2012 Gert van Valkenhoef <g.h.m.van.valkenhoef@rug.nl>
19 * All Rights Reserved.
21 * For minor contributions see the git repository.
23 * Alternatively, the contents of this file may be used under the terms of
24 * either the GNU General Public License Version 3 or later (the "GPLv3+"), or
25 * the GNU Lesser General Public License Version 3 or later (the "LGPLv3+"),
26 * in which case the provisions of the GPLv3+ or the LGPLv3+ are applicable
27 * instead of those above.
30 #include <l10ntools/HelpIndexer.hxx>
32 #include <rtl/string.hxx>
33 #include <rtl/uri.hxx>
34 #include <rtl/ustrbuf.hxx>
35 #include <osl/file.hxx>
36 #include <osl/thread.h>
40 #include "LuceneHelper.hxx"
42 using namespace lucene::document
;
44 HelpIndexer::HelpIndexer(rtl::OUString
const &lang
, rtl::OUString
const &module
,
45 rtl::OUString
const &srcDir
, rtl::OUString
const &outDir
)
46 : d_lang(lang
), d_module(module
)
48 d_indexDir
= rtl::OUStringBuffer(outDir
).append('/').
49 append(module
).appendAscii(RTL_CONSTASCII_STRINGPARAM(".idxl")).toString();
50 d_captionDir
= srcDir
+ rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/caption"));
51 d_contentDir
= srcDir
+ rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/content"));
// Builds the Lucene index for the help files of this module: scans for the
// files, chooses an analyzer from the language, opens an IndexWriter on
// d_indexDir and adds one document per entry of d_files.
// NOTE(review): the end of this function (index finalisation, release of
// `analyzer`, return value) is not visible here - confirm before changing.
54 bool HelpIndexer::indexDocuments() {
// Nothing can be indexed if the file scan failed.
55 if (!scanForFiles()) {
// Only the part of d_lang before the first '-' decides whether the CJK
// analyzer is used.
59 rtl::OUString sLang
= d_lang
.getToken(0, '-');
60 bool bUseCJK
= sLang
== "ja" || sLang
== "ko" || sLang
== "zh";
62 // Construct the analyzer appropriate for the given language
// NOTE(review): `analyzer` is a raw owning pointer allocated with `new`;
// presumably one of the two assignments below is selected by bUseCJK.
// Confirm it is deleted on every exit path, including when the writer throws.
63 lucene::analysis::Analyzer
*analyzer
;
65 analyzer
= new lucene::analysis::LanguageBasedAnalyzer(L
"cjk");
67 analyzer
= new lucene::analysis::standard::StandardAnalyzer();
// Convert the index directory from a file URL to a system path encoded in
// the thread's text encoding, which is what the writer is given below.
// NOTE(review): the result of getSystemPathFromFileURL is not checked.
69 rtl::OUString ustrSystemPath
;
70 osl::File::getSystemPathFromFileURL(d_indexDir
, ustrSystemPath
);
72 rtl::OString indexDirStr
= rtl::OUStringToOString(ustrSystemPath
, osl_getThreadTextEncoding());
// NOTE(review): the third argument presumably asks for a new index to be
// created - confirm against the CLucene IndexWriter constructor.
73 lucene::index::IndexWriter
writer(indexDirStr
.getStr(), analyzer
, true);
74 //Double the limit of tokens allowed, otherwise we'll get a too-many-tokens
75 //exception for ja help. Could alternatively ignore the exception and get
76 //truncated results as per java-Lucene apparently
77 writer
.setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH
*2);
79 // Index the identified help files
// The same `doc` object is passed to helpDocument() and to the writer for
// every file name in d_files.
// NOTE(review): the bool returned by helpDocument() is not checked.
81 for (std::set
<rtl::OUString
>::iterator i
= d_files
.begin(); i
!= d_files
.end(); ++i
) {
82 helpDocument(*i
, &doc
);
83 writer
.addDocument(&doc
);
// Accessor returning a const reference to an error message string.
// NOTE(review): the body is not visible here; presumably it returns d_error,
// which scanForFiles(path) fills in when a directory cannot be opened - confirm.
95 rtl::OUString
const & HelpIndexer::getErrorMessage() {
99 bool HelpIndexer::scanForFiles() {
100 if (!scanForFiles(d_contentDir
)) {
103 if (!scanForFiles(d_captionDir
)) {
109 bool HelpIndexer::scanForFiles(rtl::OUString
const & path
) {
111 osl::Directory
dir(path
);
112 if (osl::FileBase::E_None
!= dir
.open()) {
113 d_error
= rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("Error reading directory ")) + path
;
117 osl::DirectoryItem item
;
118 osl::FileStatus
fileStatus(osl_FileStatus_Mask_FileName
| osl_FileStatus_Mask_Type
);
119 while (dir
.getNextItem(item
) == osl::FileBase::E_None
) {
120 item
.getFileStatus(fileStatus
);
121 if (fileStatus
.getFileType() == osl::FileStatus::Regular
) {
122 d_files
.insert(fileStatus
.getFileName());
129 bool HelpIndexer::helpDocument(rtl::OUString
const & fileName
, Document
*doc
) {
130 // Add the help path as an indexed, untokenized field.
132 rtl::OUString path
= rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("#HLP#")) +
133 d_module
+ rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/")) + fileName
;
134 std::vector
<TCHAR
> aPath(OUStringToTCHARVec(path
));
135 doc
->add(*_CLNEW
Field(_T("path"), &aPath
[0], Field::STORE_YES
| Field::INDEX_UNTOKENIZED
));
137 rtl::OUString sEscapedFileName
=
138 rtl::Uri::encode(fileName
,
139 rtl_UriCharClassUric
, rtl_UriEncodeIgnoreEscapes
, RTL_TEXTENCODING_UTF8
);
141 // Add the caption as a field.
142 rtl::OUString captionPath
= d_captionDir
+ rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/")) + sEscapedFileName
;
143 doc
->add(*_CLNEW
Field(_T("caption"), helpFileReader(captionPath
), Field::STORE_NO
| Field::INDEX_TOKENIZED
));
145 // Add the content as a field.
146 rtl::OUString contentPath
= d_contentDir
+ rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/")) + sEscapedFileName
;
147 doc
->add(*_CLNEW
Field(_T("content"), helpFileReader(contentPath
), Field::STORE_NO
| Field::INDEX_TOKENIZED
));
152 lucene::util::Reader
*HelpIndexer::helpFileReader(rtl::OUString
const & path
) {
153 osl::File
file(path
);
154 if (osl::FileBase::E_None
== file
.open(osl_File_OpenFlag_Read
)) {
156 rtl::OUString ustrSystemPath
;
157 osl::File::getSystemPathFromFileURL(path
, ustrSystemPath
);
158 rtl::OString pathStr
= rtl::OUStringToOString(ustrSystemPath
, osl_getThreadTextEncoding());
159 return _CLNEW
lucene::util::FileReader(pathStr
.getStr(), "UTF-8");
161 return _CLNEW
lucene::util::StringReader(L
"");
165 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */