2006-05-19 James Livingston <doclivingston@gmail.com>
[rhythmbox.git] / podcast / rb-podcast-parse.c
blob1950aea768a0473232db5ea91f513ae0b80513d5
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
3 * arch-tag: Implementation of podcast parse
5 * Copyright (C) 2005 Renato Araujo Oliveira Filho - INdT <renato.filho@indt.org.br>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
23 #define _XOPEN_SOURCE
24 #define __EXTENSIONS__ /* get strptime */
25 #include <string.h>
26 #include <time.h>
28 #include <libxml/entities.h>
29 #include <libxml/SAX.h>
30 #include <libxml/parserInternals.h>
31 #include <libgnomevfs/gnome-vfs.h>
32 #include <glib/gi18n.h>
33 #include <gtk/gtk.h>
35 #include "rb-debug.h"
36 #include "rb-podcast-parse.h"
38 #define BUFFER_SIZE 256
40 struct RBPoadcastLoadContext
42 guint in_unknown_elt;
43 xmlParserCtxtPtr xmlctx;
44 GString *prop_value;
45 RBPodcastChannel *channel_data;
46 RBPodcastItem *item_data;
48 enum {
49 RB_PODCAST_PARSER_STATE_START,
50 RB_PODCAST_PARSER_STATE_RSS,
51 RB_PODCAST_PARSER_STATE_CHANNEL,
52 RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY,
53 RB_PODCAST_PARSER_STATE_IMG,
54 RB_PODCAST_PARSER_STATE_IMG_PROPERTY,
55 RB_PODCAST_PARSER_STATE_ITEM,
56 RB_PODCAST_PARSER_STATE_ITEM_PROPERTY,
57 RB_PODCAST_PARSER_STATE_END,
58 } state;
61 static gboolean rb_validate_channel_propert (const char *name);
62 static gboolean rb_validate_item_propert (const char *name);
63 static uintmax_t rb_podcast_parse_date (const char* date_str);
64 static gulong rb_podcast_parse_time (const char *time_str);
65 static void rb_podcast_parser_start_element (struct RBPoadcastLoadContext* ctx, const char *name, const char **attrs);
66 static void rb_podcast_parser_end_element (struct RBPoadcastLoadContext* ctx, const char *name);
67 static void rb_podcast_parser_characters (struct RBPoadcastLoadContext* ctx, const char *data, guint len);
68 static void rb_set_channel_value (struct RBPoadcastLoadContext* ctx, const char* name, const char* value);
69 static void rb_set_item_value (struct RBPoadcastLoadContext* ctx, const char* name, const char* value);
71 static RBPodcastItem *
72 rb_podcast_initializa_item ()
74 RBPodcastItem *data = g_new0 (RBPodcastItem, 1);
75 return data;
78 static void
79 rb_set_channel_value (struct RBPoadcastLoadContext *ctx,
80 const char *name,
81 const char *value)
83 xmlChar *dvalue;
85 if (value == NULL)
86 return;
88 if (name == NULL)
89 return;
91 dvalue = xmlCharStrdup (value);
92 g_strstrip ((char *)dvalue);
94 if (!strcmp (name, "title")) {
95 ctx->channel_data->title = dvalue;
96 } else if (!strcmp (name, "language")) {
97 ctx->channel_data->lang = dvalue;
98 } else if (!strcmp (name, "itunes:subtitle")) {
99 ctx->channel_data->subtitle = dvalue;
100 } else if (!strcmp (name, "itunes:summary")) {
101 ctx->channel_data->summary = dvalue;
102 } else if (!strcmp (name, "description")) {
103 ctx->channel_data->description = dvalue;
104 } else if (!strcmp (name, "generator")) {
105 if (ctx->channel_data->author == NULL)
106 ctx->channel_data->author = dvalue;
107 } else if (!strcmp (name, "itunes:author")) {
108 g_free (ctx->channel_data->author);
109 ctx->channel_data->author = dvalue;
110 } else if (!strcmp (name, "webMaster")) {
111 ctx->channel_data->contact = dvalue;
112 } else if (!strcmp (name, "pubDate")) {
113 ctx->channel_data->pub_date = rb_podcast_parse_date ((char *)dvalue);
114 g_free (dvalue);
115 } else if (!strcmp (name, "copyright")) {
116 ctx->channel_data->copyright = dvalue;
117 } else if (!strcmp (name, "img")) {
118 ctx->channel_data->img = dvalue;
119 } else {
120 g_free (dvalue);
125 static void
126 rb_set_item_value (struct RBPoadcastLoadContext *ctx,
127 const char *name,
128 const char *value)
130 xmlChar *dvalue;
132 dvalue = xmlCharStrdup (value);
133 g_strstrip ((char *)dvalue);
135 if (!strcmp (name, "title")) {
136 ctx->item_data->title = dvalue;
137 } else if (!strcmp (name, "url")) {
138 ctx->item_data->url = dvalue;
139 } else if (!strcmp (name, "pubDate")) {
140 ctx->item_data->pub_date = rb_podcast_parse_date ((char *)dvalue);
141 g_free (dvalue);
142 } else if (!strcmp (name, "description")) {
143 ctx->item_data->description = dvalue;
144 } else if (!strcmp (name, "author")) {
145 ctx->item_data->author = dvalue;
146 } else if (!strcmp (name, "itunes:duration")) {
147 ctx->item_data->duration = rb_podcast_parse_time ((char *)dvalue);
148 g_free (dvalue);
149 } else if (!strcmp (name, "length")) {
150 ctx->item_data->filesize = g_ascii_strtoull ((char *)dvalue, NULL, 10);
151 } else {
152 g_free (dvalue);
157 static void
158 rb_insert_item (struct RBPoadcastLoadContext *ctx)
160 RBPodcastItem *data = ctx->item_data;
162 rb_debug ("Inserting item as post");
164 if (!data->url) {
165 rb_debug ("Item does not have a URL, skipping");
166 return;
169 ctx->channel_data->posts = g_list_prepend (ctx->channel_data->posts, ctx->item_data);
172 static gboolean
173 rb_validate_channel_propert (const char *name)
175 if (name == NULL) {
176 return FALSE;
179 if (!strcmp (name, "title") ||
180 !strcmp (name, "language") ||
181 !strcmp (name, "itunes:subtitle") ||
182 !strcmp (name, "itunes:summary") ||
183 !strcmp (name, "description") ||
184 !strcmp (name, "generator") ||
185 !strcmp (name, "itunes:author") ||
186 !strcmp (name, "webMaster") ||
187 !strcmp (name, "lastBuildDate") ||
188 !strcmp (name, "pubDate") ||
189 !strcmp (name, "copyright")) {
190 return TRUE;
191 } else {
192 return FALSE;
197 static gboolean
198 rb_validate_item_propert (const char *name)
200 if (name == NULL) {
201 return FALSE;
204 if (!strcmp (name, "title") ||
205 !strcmp (name, "url") ||
206 !strcmp (name, "pubDate") ||
207 !strcmp (name, "description") ||
208 !strcmp (name, "author") ||
209 !strcmp (name, "itunes:duration") ) {
211 return TRUE;
212 } else {
213 return FALSE;
218 static void
219 rb_podcast_parser_start_element (struct RBPoadcastLoadContext *ctx,
220 const char *name,
221 const char **attrs)
224 rb_debug ("Start element: %s state: %d", name, ctx->state);
226 switch (ctx->state) {
227 case RB_PODCAST_PARSER_STATE_START:
229 if (!strcmp (name, "rss")) {
230 ctx->state = RB_PODCAST_PARSER_STATE_RSS;
231 } else {
232 ctx->in_unknown_elt++;
235 break;
238 case RB_PODCAST_PARSER_STATE_RSS:
240 if (!strcmp (name, "channel")) {
241 ctx->state = RB_PODCAST_PARSER_STATE_CHANNEL;
242 } else {
243 ctx->in_unknown_elt++;
246 break;
249 case RB_PODCAST_PARSER_STATE_CHANNEL:
252 if (strcmp (name, "image") == 0
253 || strcmp (name, "itunes:image") == 0) {
254 ctx->state = RB_PODCAST_PARSER_STATE_IMG;
255 } else if (!strcmp (name, "item")) {
256 ctx->item_data = rb_podcast_initializa_item ();
257 ctx->state = RB_PODCAST_PARSER_STATE_ITEM;
258 } else if (!rb_validate_channel_propert (name)) {
259 rb_debug ("Unknown property");
260 ctx->in_unknown_elt++;
261 } else {
262 ctx->state = RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY;
265 break;
268 case RB_PODCAST_PARSER_STATE_ITEM:
270 if (!strcmp (name, "enclosure")) {
271 for (; *attrs; attrs +=2) {
272 if (!strcmp (*attrs, "url")) {
273 const char *url_value = *(attrs + 1);
274 rb_set_item_value (ctx, "url", url_value);
275 } else if (!strcmp (*attrs, "length")) {
276 const char *length_value = *(attrs + 1);
277 rb_set_item_value (ctx, "length", length_value);
281 ctx->state = RB_PODCAST_PARSER_STATE_ITEM_PROPERTY;
283 } else if (!rb_validate_item_propert (name)) {
284 ctx->in_unknown_elt++;
285 } else {
286 ctx->state = RB_PODCAST_PARSER_STATE_ITEM_PROPERTY;
289 break;
292 case RB_PODCAST_PARSER_STATE_IMG:
294 if (strcmp (name, "url") != 0) {
295 ctx->in_unknown_elt++;
296 } else {
297 ctx->state = RB_PODCAST_PARSER_STATE_IMG_PROPERTY;
300 break;
303 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY:
304 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY:
305 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY:
306 case RB_PODCAST_PARSER_STATE_END:
307 break;
308 default:
309 g_warning ("Unknown podcast parser state: %d", ctx->state);
310 break;
315 static void
316 rb_podcast_parser_end_element (struct RBPoadcastLoadContext *ctx,
317 const char *name)
319 rb_debug ("End element: %s state: %d", name, ctx->state);
321 if (ctx->in_unknown_elt > 0) {
322 ctx->in_unknown_elt--;
323 rb_debug ("Unknown element");
324 return;
327 switch (ctx->state) {
328 case RB_PODCAST_PARSER_STATE_START:
329 ctx->state = RB_PODCAST_PARSER_STATE_END;
330 break;
332 case RB_PODCAST_PARSER_STATE_RSS:
333 ctx->state = RB_PODCAST_PARSER_STATE_START;
334 break;
336 case RB_PODCAST_PARSER_STATE_CHANNEL:
337 ctx->state = RB_PODCAST_PARSER_STATE_RSS;
338 break;
340 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY:
342 rb_set_channel_value (ctx, name, ctx->prop_value->str);
343 ctx->state = RB_PODCAST_PARSER_STATE_CHANNEL;
344 g_string_truncate (ctx->prop_value, 0);
345 break;
348 case RB_PODCAST_PARSER_STATE_ITEM:
350 rb_insert_item (ctx);
351 ctx->state = RB_PODCAST_PARSER_STATE_CHANNEL;
352 break;
355 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY:
357 rb_set_item_value (ctx, name, ctx->prop_value->str);
358 ctx->state = RB_PODCAST_PARSER_STATE_ITEM;
359 g_string_truncate (ctx->prop_value, 0);
360 break;
363 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY:
365 rb_set_channel_value (ctx, "img", ctx->prop_value->str);
366 ctx->state = RB_PODCAST_PARSER_STATE_IMG;
367 g_string_truncate (ctx->prop_value, 0);
368 break;
371 case RB_PODCAST_PARSER_STATE_IMG:
372 ctx->state = RB_PODCAST_PARSER_STATE_CHANNEL;
373 break;
375 case RB_PODCAST_PARSER_STATE_END:
376 break;
378 default:
379 g_warning ("Unknown podcast parser state: %d", ctx->state);
380 break;
385 static void
386 rb_podcast_parser_characters (struct RBPoadcastLoadContext *ctx,
387 const char *data,
388 guint len)
390 switch (ctx->state) {
391 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY:
392 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY:
393 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY:
394 g_string_append_len (ctx->prop_value, data, len);
395 break;
396 case RB_PODCAST_PARSER_STATE_START:
397 case RB_PODCAST_PARSER_STATE_IMG:
398 case RB_PODCAST_PARSER_STATE_RSS:
399 case RB_PODCAST_PARSER_STATE_CHANNEL:
400 case RB_PODCAST_PARSER_STATE_ITEM:
401 case RB_PODCAST_PARSER_STATE_END:
402 break;
403 default:
404 g_warning ("Unknown podcast parser state: %d", ctx->state);
405 break;
410 gboolean
411 rb_podcast_parse_load_feed (RBPodcastChannel *data,
412 const char *file_name)
414 xmlParserCtxtPtr ctxt;
415 xmlSAXHandlerPtr sax_handler = NULL;
416 GnomeVFSResult result;
417 GnomeVFSFileInfo *info;
418 gint file_size;
419 gchar *buffer = NULL;
421 struct RBPoadcastLoadContext *ctx = NULL;
423 data->url = xmlCharStrdup (file_name);
425 if (!g_str_has_suffix (file_name, ".rss") && !g_str_has_suffix (file_name, ".xml")) {
426 gboolean invalid_mime_type;
428 info = gnome_vfs_file_info_new ();
430 result = gnome_vfs_get_file_info (file_name, info, GNOME_VFS_FILE_INFO_DEFAULT);
432 if (info != NULL
433 && info->mime_type != NULL
434 && strstr (info->mime_type, "xml") == NULL
435 && strstr (info->mime_type, "rss") == NULL) {
436 invalid_mime_type = TRUE;
437 } else {
438 invalid_mime_type = FALSE;
441 if ((result != GNOME_VFS_OK)) {
442 rb_debug ("Invalid mime-type in podcast feed %s", info->mime_type);
443 gnome_vfs_file_info_unref (info);
444 return TRUE;
447 if (invalid_mime_type) {
448 GtkWidget *dialog;
450 GDK_THREADS_ENTER ();
451 dialog = gtk_message_dialog_new (NULL, 0,
452 GTK_MESSAGE_QUESTION,
453 GTK_BUTTONS_YES_NO,
454 _("The URL '%s' does not appear to be a podcast feed. "
455 "It may be the wrong URL, or the feed may be broken. "
456 "Would you like Rhythmbox to attempt to use it anyway?"),
457 file_name);
459 if (gtk_dialog_run (GTK_DIALOG (dialog)) == GTK_RESPONSE_YES)
460 invalid_mime_type = FALSE;
462 gtk_widget_destroy (dialog);
463 GDK_THREADS_LEAVE ();
466 gnome_vfs_file_info_unref (info);
468 if (invalid_mime_type)
469 return FALSE;
472 /* first download file by gnome_vfs for use gnome network configuration */
473 result = gnome_vfs_read_entire_file (file_name, &file_size, &buffer);
474 if (result != GNOME_VFS_OK)
475 return TRUE;
478 /* initializing parse */
479 sax_handler = g_new0 (xmlSAXHandler, 1);
480 sax_handler->startElement = (startElementSAXFunc) rb_podcast_parser_start_element;
481 sax_handler->endElement = (endElementSAXFunc) rb_podcast_parser_end_element;
482 sax_handler->characters = (charactersSAXFunc) rb_podcast_parser_characters;
483 xmlSubstituteEntitiesDefault (1);
485 ctx = g_new0 (struct RBPoadcastLoadContext, 1);
486 ctx->in_unknown_elt = 0;
487 ctx->channel_data = data;
488 ctx->prop_value = g_string_sized_new (512);
490 ctxt = xmlCreateMemoryParserCtxt (buffer, file_size);
491 if (ctx == NULL) {
492 g_free (sax_handler);
493 g_free (buffer);
494 g_string_free (ctx->prop_value, TRUE);
495 g_free (ctx);
496 return FALSE;
499 ctx->xmlctx = ctxt;
500 ctxt->userData = ctx;
501 ctxt->sax = sax_handler;
502 xmlParseDocument (ctxt);
504 g_free (sax_handler);
505 ctxt->sax = NULL;
506 xmlFreeParserCtxt (ctxt);
508 g_free (buffer);
509 g_string_free (ctx->prop_value, TRUE);
510 g_free (ctx);
512 data->posts = g_list_reverse (data->posts);
513 return TRUE;
/*
 * Convert a feed date string to a time value.
 *
 * @date_str: date text as found in the feed
 *
 * A series of strptime() formats is tried in a fixed order and the first
 * one that matches wins.  The struct tm is cleared before every attempt,
 * including the first, so mktime() never reads fields that strptime()
 * left untouched.
 *
 * Returns the mktime() result cast to uintmax_t.  When no format matches,
 * a debug message is logged and the value is whatever mktime() yields for
 * an all-zero struct tm.
 */
static uintmax_t
rb_podcast_parse_date (const char *date_str)
{
	static const char *const simple_formats[] = {
		"%a, %d %b %Y %T",	/* RFC 2822 date format */
		"%a %d %b %Y %T",	/* same as above, but without comma */
		"%a, %d %b %Y 0%T",	/* close-to-RFC 2822, but with extra 0 */
		"%d %b %Y %T",		/* format without weekday */
		"%a, %B %d %Y %T",	/* reversed day and long month */
		"%Y-%m-%d %T",		/* ISO date like */
		"%Y-%m-%d"		/* ISO date like, without time of day */
	};
	struct tm tm;
	char *result = NULL;
	size_t i;

	for (i = 0; result == NULL && i < sizeof (simple_formats) / sizeof (simple_formats[0]); i++) {
		memset (&tm, 0, sizeof (struct tm));
		result = strptime (date_str, simple_formats[i], &tm);
	}

	/* Broken weekday short names */
	if (result == NULL) {
		const char *tmp;

		/* strip off the erroneous weekday */
		tmp = strstr (date_str, ",");
		if (tmp != NULL) {
			tmp++;
			memset (&tm, 0, sizeof (struct tm));
			result = strptime (tmp, "%d %b %Y %T", &tm);
		}
	}

	/* format with timezone offset from GMT */
	if (result == NULL) {
		memset (&tm, 0, sizeof (struct tm));
		result = strptime (date_str, "%a %b %d %T %z %Y", &tm);
	}

	/* format with timezone name */
	if (result == NULL) {
		memset (&tm, 0, sizeof (struct tm));

		/* match first part of time string */
		result = strptime (date_str, "%a %b %d %T ", &tm);

		/* look for anything with a timezone name-like format
		   i.e. at least one all caps alphabetical character */
		if (result != NULL) {
			size_t n;

			n = strspn (result, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");

			/* make sure there was at least one character that matched */
			if (n > 0)
				/* remaining part must be the year */
				result = strptime (result + n, "%Y", &tm);
			else
				result = NULL;
		}
	}

	if (result == NULL) {
		memset (&tm, 0, sizeof (struct tm));
		rb_debug ("unable to convert date string %s", date_str);
	}

	return (uintmax_t) mktime (&tm);
}
614 static gulong
615 rb_podcast_parse_time (const char *time_str)
617 struct tm tm;
618 char *result;
620 memset (&tm, 0, sizeof (struct tm));
621 result = strptime (time_str, "%H:%M:%S", &tm);
622 if (result == NULL) {
623 memset (&tm, 0, sizeof (struct tm));
624 result = strptime (time_str, "%M:%S", &tm);
626 if (result == NULL) {
627 memset (&tm, 0, sizeof (struct tm));
628 rb_debug ("unable to convert duration string %s", time_str);
631 return ((tm.tm_hour * 60 + tm.tm_min) * 60 + tm.tm_sec);
634 void
635 rb_podcast_parse_channel_free (RBPodcastChannel *data)
637 g_return_if_fail (data != NULL);
639 g_list_foreach (data->posts, (GFunc) rb_podcast_parse_item_free, NULL);
640 g_list_free (data->posts);
641 data->posts = NULL;
643 g_free (data->url);
644 g_free (data->title);
645 g_free (data->lang);
646 g_free (data->subtitle);
647 g_free (data->summary);
648 g_free (data->description);
649 g_free (data->author);
650 g_free (data->contact);
651 g_free (data->img);
652 g_free (data->copyright);
654 g_free (data);
655 data = NULL;
658 void
659 rb_podcast_parse_item_free (RBPodcastItem *item)
661 g_return_if_fail (item != NULL);
663 g_free (item->title);
664 g_free (item->url);
665 g_free (item->description);
667 g_free (item);