xapian-applications/omega/opendocmetaparser.cc

   1 /** @file
   2  * @brief Parser for OpenDocument's meta.xml.
   3  *
   4  * Also used for MSXML's docProps/core.xml.
   5  */
   6 /* Copyright (C) 2006,2009,2010,2011,2013,2015,2020,2022 Olly Betts
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  21  */
  22
  23 #include <config.h>
  24
  25 #include "opendocmetaparser.h"
  26
  27 #include "datetime.h"
  28 #include "parseint.h"
  29
  30 using namespace std;
  31
  32 void
  33 OpenDocMetaParser::process_content(const string& content)
  34 {
  35     switch (field) {
  36         case KEYWORDS:
  37             if (!keywords.empty()) keywords += ' ';
  38             keywords += content;
  39             break;
  40         case TITLE:
  41             if (!title.empty()) title += ' ';
  42             title += content;
  43             break;
  44         case SAMPLE:
  45             if (!sample.empty()) sample += ' ';
  46             sample += content;
  47             break;
  48         case AUTHOR:
  49             if (!author.empty()) author += ' ';
  50             author += content;
  51             break;
  52         case CREATED: {
  53             // E.g. 2013-03-04T22:57:00
  54             created = parse_datetime(content);
  55             break;
  56         }
  57         case NONE:
  58             // Ignore other fields.
  59             break;
  60     }
  61 }
  62
  63 bool
  64 OpenDocMetaParser::opening_tag(const string& tag)
  65 {
  66     if (tag.size() < 8) return true;
  67     if (tag[0] == 'd' && tag[1] == 'c') {
  68         if (tag == "dc:subject") {
  69             // OpenDocument, MSXML.
  70             //
  71             // dc:subject is "Subject and Keywords":
  72             // "Typically, Subject will be expressed as keywords, key phrases
  73             // or classification codes that describe a topic of the resource."
  74             // OpenOffice uses meta:keywords for keywords - dc:subject
  75             // comes from a text field labelled "Subject".  Let's just treat
  76             // it as more keywords.
  77             field = KEYWORDS;
  78         } else if (tag == "dc:title") {
  79             // OpenDocument, MSXML.
  80             field = TITLE;
  81         } else if (tag == "dc:description") {
  82             // OpenDocument, MSXML.
  83             field = SAMPLE;
  84         } else if (tag == "dc:creator") {
  85             // OpenDocument, MSXML.
  86             field = AUTHOR;
  87         } else if (tag == "dcterms:created") {
  88             // MSXML.
  89             field = CREATED;
  90         }
  91     } else if (tag[0] == 'm') {
  92         if (tag == "meta:keyword") {
  93             // OpenDocument.
  94             //
  95             // e.g.:
  96             // <meta:keywords>
  97             // <meta:keyword>information retrieval</meta:keyword>
  98             // </meta:keywords>
  99             field = KEYWORDS;
 100         } else if (tag == "meta:creation-date") {
 101             // OpenDocument.
 102             field = CREATED;
 103         } else if (tag == "meta:document-statistic") {
 104             // OpenDocument:
 105             //
 106             // The values we want for the page count are to be found as
 107             // attributes of the meta:document-statistic tag (which occurs
 108             // inside <office:meta> but we don't bother to check that).
 109             //
 110             // For text documents, we want the meta:page-count attribute.
 111             //
 112             // For spreadsheets, meta:table-count seems to give the sheet count
 113             // (text documents also have meta:table-count so we check for this
 114             // after meta:page-count).
 115             string value;
 116             if (get_attribute("meta:page-count", value) ||
 117                 get_attribute("meta:table-count", value)) {
 118                 unsigned u_pages;
 119                 if (parse_unsigned(value.c_str(), u_pages))
 120                     pages = int(u_pages);
 121             }
 122         }
 123     } else if (tag[0] == 'c' && tag[1] == 'p') {
 124         if (tag == "cp:keywords") {
 125             // MSXML.
 126             field = KEYWORDS;
 127         }
 128     }
 129     return true;
 130 }
 131
/** Handle a closing tag.
 *
 *  The tag name is ignored: any closing tag resets `field` to NONE, so
 *  process_content() discards content until opening_tag() selects a field
 *  again.
 *
 *  @return Always true.
 */
bool
OpenDocMetaParser::closing_tag(const string&)
{
    // Stop collecting content for whichever field (if any) was active.
    field = NONE;
    return true;
}
 137 }