Updated Finnish translation
[rhythmbox.git] / podcast / rb-podcast-parse.c
blob6d67a957a5be8143ff1a333e093b162006ee658e
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
3 * arch-tag: Implementation of podcast parse
5 * Copyright (C) 2005 Renato Araujo Oliveira Filho - INdT <renato.filho@indt.org.br>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
23 #define _XOPEN_SOURCE
24 #define __EXTENSIONS__ /* get strptime */
25 #include <string.h>
26 #include <time.h>
28 #include <libxml/entities.h>
29 #include <libxml/SAX.h>
30 #include <libxml/parserInternals.h>
31 #include <libgnomevfs/gnome-vfs.h>
32 #include <glib/gi18n.h>
33 #include <gtk/gtk.h>
35 #include "rb-debug.h"
36 #include "rb-podcast-parse.h"
38 #define BUFFER_SIZE 256
40 struct RBPoadcastLoadContext
42 guint in_unknown_elt;
43 xmlParserCtxtPtr xmlctx;
44 GString *prop_value;
45 RBPodcastChannel *channel_data;
46 RBPodcastItem *item_data;
48 enum {
49 RB_PODCAST_PARSER_STATE_START,
50 RB_PODCAST_PARSER_STATE_RSS,
51 RB_PODCAST_PARSER_STATE_CHANNEL,
52 RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY,
53 RB_PODCAST_PARSER_STATE_IMG,
54 RB_PODCAST_PARSER_STATE_IMG_PROPERTY,
55 RB_PODCAST_PARSER_STATE_ITEM,
56 RB_PODCAST_PARSER_STATE_ITEM_PROPERTY,
57 RB_PODCAST_PARSER_STATE_END,
58 } state;
61 static gboolean rb_validate_channel_propert (const char *name);
62 static gboolean rb_validate_item_propert (const char *name);
63 static uintmax_t rb_podcast_parse_date (const char* date_str);
64 static gulong rb_podcast_parse_time (const char *time_str);
65 static void rb_podcast_parser_start_element (struct RBPoadcastLoadContext* ctx, const char *name, const char **attrs);
66 static void rb_podcast_parser_end_element (struct RBPoadcastLoadContext* ctx, const char *name);
67 static void rb_podcast_parser_characters (struct RBPoadcastLoadContext* ctx, const char *data, guint len);
68 static void rb_set_channel_value (struct RBPoadcastLoadContext* ctx, const char* name, const char* value);
69 static void rb_set_item_value (struct RBPoadcastLoadContext* ctx, const char* name, const char* value);
71 static RBPodcastItem *
72 rb_podcast_initializa_item ()
74 RBPodcastItem *data = g_new0 (RBPodcastItem, 1);
75 return data;
78 static void
79 rb_set_channel_value (struct RBPoadcastLoadContext *ctx,
80 const char *name,
81 const char *value)
83 xmlChar *dvalue;
85 if (value == NULL)
86 return;
88 if (name == NULL)
89 return;
91 dvalue = xmlCharStrdup (value);
92 g_strstrip ((char *)dvalue);
94 if (!strcmp (name, "title")) {
95 ctx->channel_data->title = dvalue;
96 } else if (!strcmp (name, "language")) {
97 ctx->channel_data->lang = dvalue;
98 } else if (!strcmp (name, "itunes:subtitle")) {
99 ctx->channel_data->subtitle = dvalue;
100 } else if (!strcmp (name, "itunes:summary")) {
101 ctx->channel_data->summary = dvalue;
102 } else if (!strcmp (name, "description")) {
103 ctx->channel_data->description = dvalue;
104 } else if (!strcmp (name, "generator")) {
105 if (ctx->channel_data->author == NULL)
106 ctx->channel_data->author = dvalue;
107 } else if (!strcmp (name, "itunes:author")) {
108 g_free (ctx->channel_data->author);
109 ctx->channel_data->author = dvalue;
110 } else if (!strcmp (name, "webMaster")) {
111 ctx->channel_data->contact = dvalue;
112 } else if (!strcmp (name, "pubDate")) {
113 ctx->channel_data->pub_date = rb_podcast_parse_date ((char *)dvalue);
114 g_free (dvalue);
115 } else if (!strcmp (name, "copyright")) {
116 ctx->channel_data->copyright = dvalue;
117 } else if (!strcmp (name, "img")) {
118 ctx->channel_data->img = dvalue;
119 } else {
120 g_free (dvalue);
124 static void
125 rb_set_item_value (struct RBPoadcastLoadContext *ctx,
126 const char *name,
127 const char *value)
129 xmlChar *dvalue;
131 dvalue = xmlCharStrdup (value);
132 g_strstrip ((char *)dvalue);
134 if (!strcmp (name, "title")) {
135 ctx->item_data->title = dvalue;
136 } else if (!strcmp (name, "url")) {
137 ctx->item_data->url = dvalue;
138 } else if (!strcmp (name, "pubDate")) {
139 ctx->item_data->pub_date = rb_podcast_parse_date ((char *)dvalue);
140 g_free (dvalue);
141 } else if (!strcmp (name, "description")) {
142 ctx->item_data->description = dvalue;
143 } else if (!strcmp (name, "author")) {
144 ctx->item_data->author = dvalue;
145 } else if (!strcmp (name, "itunes:duration")) {
146 ctx->item_data->duration = rb_podcast_parse_time ((char *)dvalue);
147 g_free (dvalue);
148 } else if (!strcmp (name, "length")) {
149 ctx->item_data->filesize = g_ascii_strtoull ((char *)dvalue, NULL, 10);
150 } else {
151 g_free (dvalue);
155 static void
156 rb_insert_item (struct RBPoadcastLoadContext *ctx)
158 RBPodcastItem *data = ctx->item_data;
160 rb_debug ("Inserting item as post");
162 if (!data->url) {
163 rb_debug ("Item does not have a URL, skipping");
164 return;
167 ctx->channel_data->posts = g_list_prepend (ctx->channel_data->posts, ctx->item_data);
170 static gboolean
171 rb_validate_channel_propert (const char *name)
173 if (name == NULL) {
174 return FALSE;
177 if (!strcmp (name, "title") ||
178 !strcmp (name, "language") ||
179 !strcmp (name, "itunes:subtitle") ||
180 !strcmp (name, "itunes:summary") ||
181 !strcmp (name, "description") ||
182 !strcmp (name, "generator") ||
183 !strcmp (name, "itunes:author") ||
184 !strcmp (name, "webMaster") ||
185 !strcmp (name, "lastBuildDate") ||
186 !strcmp (name, "pubDate") ||
187 !strcmp (name, "copyright")) {
188 return TRUE;
189 } else {
190 return FALSE;
195 static gboolean
196 rb_validate_item_propert (const char *name)
198 if (name == NULL) {
199 return FALSE;
202 if (!strcmp (name, "title") ||
203 !strcmp (name, "url") ||
204 !strcmp (name, "pubDate") ||
205 !strcmp (name, "description") ||
206 !strcmp (name, "author") ||
207 !strcmp (name, "itunes:duration") ) {
209 return TRUE;
210 } else {
211 return FALSE;
215 static void
216 rb_podcast_parser_start_element (struct RBPoadcastLoadContext *ctx,
217 const char *name,
218 const char **attrs)
221 rb_debug ("Start element: %s state: %d", name, ctx->state);
223 switch (ctx->state) {
224 case RB_PODCAST_PARSER_STATE_START:
226 if (!strcmp (name, "rss")) {
227 ctx->state = RB_PODCAST_PARSER_STATE_RSS;
228 } else {
229 ctx->in_unknown_elt++;
232 break;
235 case RB_PODCAST_PARSER_STATE_RSS:
237 if (!strcmp (name, "channel")) {
238 ctx->state = RB_PODCAST_PARSER_STATE_CHANNEL;
239 } else {
240 ctx->in_unknown_elt++;
243 break;
246 case RB_PODCAST_PARSER_STATE_CHANNEL:
249 if (strcmp (name, "image") == 0
250 || strcmp (name, "itunes:image") == 0) {
251 ctx->state = RB_PODCAST_PARSER_STATE_IMG;
252 } else if (!strcmp (name, "item")) {
253 ctx->item_data = rb_podcast_initializa_item ();
254 ctx->state = RB_PODCAST_PARSER_STATE_ITEM;
255 } else if (!rb_validate_channel_propert (name)) {
256 rb_debug ("Unknown property");
257 ctx->in_unknown_elt++;
258 } else {
259 ctx->state = RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY;
262 break;
265 case RB_PODCAST_PARSER_STATE_ITEM:
267 if (!strcmp (name, "enclosure")) {
268 for (; *attrs; attrs +=2) {
269 if (!strcmp (*attrs, "url")) {
270 const char *url_value = *(attrs + 1);
271 rb_set_item_value (ctx, "url", url_value);
272 } else if (!strcmp (*attrs, "length")) {
273 const char *length_value = *(attrs + 1);
274 rb_set_item_value (ctx, "length", length_value);
278 ctx->state = RB_PODCAST_PARSER_STATE_ITEM_PROPERTY;
280 } else if (!rb_validate_item_propert (name)) {
281 ctx->in_unknown_elt++;
282 } else {
283 ctx->state = RB_PODCAST_PARSER_STATE_ITEM_PROPERTY;
286 break;
289 case RB_PODCAST_PARSER_STATE_IMG:
291 if (strcmp (name, "url") != 0) {
292 ctx->in_unknown_elt++;
293 } else {
294 ctx->state = RB_PODCAST_PARSER_STATE_IMG_PROPERTY;
297 break;
300 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY:
301 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY:
302 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY:
303 rb_debug ("nested element inside property; treating as unknown");
304 ctx->in_unknown_elt++;
305 break;
307 case RB_PODCAST_PARSER_STATE_END:
308 break;
309 default:
310 g_warning ("Unknown podcast parser state: %d", ctx->state);
311 break;
315 static void
316 rb_podcast_parser_end_element (struct RBPoadcastLoadContext *ctx,
317 const char *name)
319 rb_debug ("End element: %s state: %d", name, ctx->state);
321 if (ctx->in_unknown_elt > 0) {
322 ctx->in_unknown_elt--;
323 rb_debug ("Unknown element");
324 return;
327 switch (ctx->state) {
328 case RB_PODCAST_PARSER_STATE_START:
329 ctx->state = RB_PODCAST_PARSER_STATE_END;
330 break;
332 case RB_PODCAST_PARSER_STATE_RSS:
333 ctx->state = RB_PODCAST_PARSER_STATE_START;
334 break;
336 case RB_PODCAST_PARSER_STATE_CHANNEL:
337 ctx->state = RB_PODCAST_PARSER_STATE_RSS;
338 break;
340 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY:
342 rb_set_channel_value (ctx, name, ctx->prop_value->str);
343 ctx->state = RB_PODCAST_PARSER_STATE_CHANNEL;
344 g_string_truncate (ctx->prop_value, 0);
345 break;
348 case RB_PODCAST_PARSER_STATE_ITEM:
350 rb_insert_item (ctx);
351 ctx->state = RB_PODCAST_PARSER_STATE_CHANNEL;
352 break;
355 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY:
357 rb_set_item_value (ctx, name, ctx->prop_value->str);
358 ctx->state = RB_PODCAST_PARSER_STATE_ITEM;
359 g_string_truncate (ctx->prop_value, 0);
360 break;
363 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY:
365 rb_set_channel_value (ctx, "img", ctx->prop_value->str);
366 ctx->state = RB_PODCAST_PARSER_STATE_IMG;
367 g_string_truncate (ctx->prop_value, 0);
368 break;
371 case RB_PODCAST_PARSER_STATE_IMG:
372 ctx->state = RB_PODCAST_PARSER_STATE_CHANNEL;
373 break;
375 case RB_PODCAST_PARSER_STATE_END:
376 break;
378 default:
379 g_warning ("Unknown podcast parser state: %d", ctx->state);
380 break;
384 static void
385 rb_podcast_parser_characters (struct RBPoadcastLoadContext *ctx,
386 const char *data,
387 guint len)
389 switch (ctx->state) {
390 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY:
391 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY:
392 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY:
393 g_string_append_len (ctx->prop_value, data, len);
394 break;
395 case RB_PODCAST_PARSER_STATE_START:
396 case RB_PODCAST_PARSER_STATE_IMG:
397 case RB_PODCAST_PARSER_STATE_RSS:
398 case RB_PODCAST_PARSER_STATE_CHANNEL:
399 case RB_PODCAST_PARSER_STATE_ITEM:
400 case RB_PODCAST_PARSER_STATE_END:
401 break;
402 default:
403 g_warning ("Unknown podcast parser state: %d", ctx->state);
404 break;
408 gboolean
409 rb_podcast_parse_load_feed (RBPodcastChannel *data,
410 const char *file_name)
412 xmlParserCtxtPtr ctxt;
413 xmlSAXHandlerPtr sax_handler = NULL;
414 GnomeVFSResult result;
415 GnomeVFSFileInfo *info;
416 gint file_size;
417 gchar *buffer = NULL;
419 struct RBPoadcastLoadContext *ctx = NULL;
421 data->url = xmlCharStrdup (file_name);
423 if (!g_str_has_suffix (file_name, ".rss") && !g_str_has_suffix (file_name, ".xml")) {
424 gboolean invalid_mime_type;
426 info = gnome_vfs_file_info_new ();
428 result = gnome_vfs_get_file_info (file_name, info, GNOME_VFS_FILE_INFO_DEFAULT);
430 if (info != NULL
431 && info->mime_type != NULL
432 && strstr (info->mime_type, "html") == NULL
433 && strstr (info->mime_type, "xml") == NULL
434 && strstr (info->mime_type, "rss") == NULL) {
435 invalid_mime_type = TRUE;
436 } else {
437 invalid_mime_type = FALSE;
440 if ((result != GNOME_VFS_OK)) {
441 rb_debug ("Invalid mime-type in podcast feed %s", info->mime_type);
442 gnome_vfs_file_info_unref (info);
443 return TRUE;
446 if (invalid_mime_type) {
447 GtkWidget *dialog;
449 GDK_THREADS_ENTER ();
450 dialog = gtk_message_dialog_new (NULL, 0,
451 GTK_MESSAGE_QUESTION,
452 GTK_BUTTONS_YES_NO,
453 _("The URL '%s' does not appear to be a podcast feed. "
454 "It may be the wrong URL, or the feed may be broken. "
455 "Would you like Rhythmbox to attempt to use it anyway?"),
456 file_name);
458 if (gtk_dialog_run (GTK_DIALOG (dialog)) == GTK_RESPONSE_YES)
459 invalid_mime_type = FALSE;
461 gtk_widget_destroy (dialog);
462 GDK_THREADS_LEAVE ();
465 gnome_vfs_file_info_unref (info);
467 if (invalid_mime_type)
468 return FALSE;
471 /* first download file by gnome_vfs for use gnome network configuration */
472 result = gnome_vfs_read_entire_file (file_name, &file_size, &buffer);
473 if (result != GNOME_VFS_OK)
474 return TRUE;
476 /* initializing parse */
477 sax_handler = g_new0 (xmlSAXHandler, 1);
478 sax_handler->startElement = (startElementSAXFunc) rb_podcast_parser_start_element;
479 sax_handler->endElement = (endElementSAXFunc) rb_podcast_parser_end_element;
480 sax_handler->characters = (charactersSAXFunc) rb_podcast_parser_characters;
481 xmlSubstituteEntitiesDefault (1);
483 ctx = g_new0 (struct RBPoadcastLoadContext, 1);
484 ctx->in_unknown_elt = 0;
485 ctx->channel_data = data;
486 ctx->prop_value = g_string_sized_new (512);
488 ctxt = xmlCreateMemoryParserCtxt (buffer, file_size);
489 if (ctx == NULL) {
490 g_free (sax_handler);
491 g_free (buffer);
492 g_string_free (ctx->prop_value, TRUE);
493 g_free (ctx);
494 return FALSE;
497 ctx->xmlctx = ctxt;
498 ctxt->userData = ctx;
499 ctxt->sax = sax_handler;
500 xmlParseDocument (ctxt);
502 g_free (sax_handler);
503 ctxt->sax = NULL;
504 xmlFreeParserCtxt (ctxt);
506 g_free (buffer);
507 g_string_free (ctx->prop_value, TRUE);
508 g_free (ctx);
510 data->posts = g_list_reverse (data->posts);
511 return TRUE;
/*
 * Convert a feed date string to a time value.
 *
 * date_str: date text as found in the feed
 *
 * A series of known layouts is tried in turn with strptime(); the first
 * one that matches wins.  The broken-down time is cleared before every
 * attempt so that fields strptime() does not set never reach mktime()
 * uninitialised.
 *
 * Returns: the result of mktime() on the parsed time, cast to uintmax_t.
 * If no layout matches, mktime() is applied to an all-zero struct tm.
 */
static uintmax_t
rb_podcast_parse_date (const char *date_str)
{
	/* layouts that can be handed to strptime() as-is, in priority order */
	static const char *const simple_formats[] = {
		"%a, %d %b %Y %T",	/* RFC 2822 date format */
		"%a %d %b %Y %T",	/* same as above, but without comma */
		"%a, %d %b %Y 0%T",	/* close-to-RFC 2822, but with extra 0 */
		"%d %b %Y %T",		/* format without weekday */
		"%a, %B %d %Y %T",	/* reversed day and long month */
		"%Y-%m-%d %T",		/* ISO-like date with time */
		"%Y-%m-%d",		/* ISO-like date without time */
	};
	struct tm tm;
	char *result = NULL;
	size_t i;

	memset (&tm, 0, sizeof (struct tm));

	for (i = 0; result == NULL && i < sizeof simple_formats / sizeof simple_formats[0]; i++) {
		memset (&tm, 0, sizeof (struct tm));
		result = strptime (date_str, simple_formats[i], &tm);
	}

	/* Broken weekday short names */
	if (result == NULL) {
		const char *tmp;

		/* strip off the erroneous weekday */
		tmp = strstr (date_str, ",");
		if (tmp != NULL) {
			tmp++;
			memset (&tm, 0, sizeof (struct tm));
			result = strptime (tmp, "%d %b %Y %T", &tm);
		}
	}

	/* format with timezone offset from GMT */
	if (result == NULL) {
		memset (&tm, 0, sizeof (struct tm));
		result = strptime (date_str, "%a %b %d %T %z %Y", &tm);
	}

	/* format with timezone name */
	if (result == NULL) {
		memset (&tm, 0, sizeof (struct tm));

		/* match first part of time string */
		result = strptime (date_str, "%a %b %d %T ", &tm);

		/* look for anything with a timezone name-like format
		   i.e. at least one all caps alphabetical character */
		if (result != NULL) {
			size_t n;

			n = strspn (result, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");

			/* make sure there was at least one character that matched */
			if (n > 0)
				/* remaining part must be the year */
				result = strptime (result + n, "%Y", &tm);
			else
				result = NULL;
		}
	}

	if (result == NULL) {
		memset (&tm, 0, sizeof (struct tm));
		rb_debug ("unable to convert date string %s", date_str);
	}

	return (uintmax_t) mktime (&tm);
}
612 static gulong
613 rb_podcast_parse_time (const char *time_str)
615 struct tm tm;
616 char *result;
618 memset (&tm, 0, sizeof (struct tm));
619 result = strptime (time_str, "%H:%M:%S", &tm);
620 if (result == NULL) {
621 memset (&tm, 0, sizeof (struct tm));
622 result = strptime (time_str, "%M:%S", &tm);
624 if (result == NULL) {
625 memset (&tm, 0, sizeof (struct tm));
626 rb_debug ("unable to convert duration string %s", time_str);
629 return ((tm.tm_hour * 60 + tm.tm_min) * 60 + tm.tm_sec);
632 void
633 rb_podcast_parse_channel_free (RBPodcastChannel *data)
635 g_return_if_fail (data != NULL);
637 g_list_foreach (data->posts, (GFunc) rb_podcast_parse_item_free, NULL);
638 g_list_free (data->posts);
639 data->posts = NULL;
641 g_free (data->url);
642 g_free (data->title);
643 g_free (data->lang);
644 g_free (data->subtitle);
645 g_free (data->summary);
646 g_free (data->description);
647 g_free (data->author);
648 g_free (data->contact);
649 g_free (data->img);
650 g_free (data->copyright);
652 g_free (data);
653 data = NULL;
656 void
657 rb_podcast_parse_item_free (RBPodcastItem *item)
659 g_return_if_fail (item != NULL);
661 g_free (item->title);
662 g_free (item->url);
663 g_free (item->description);
665 g_free (item);