1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
3 * arch-tag: Implementation of podcast parse
5 * Copyright (C) 2005 Renato Araujo Oliveira Filho - INdT <renato.filho@indt.org.br>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
24 #define __EXTENSIONS__ /* get strptime */
28 #include <libxml/entities.h>
29 #include <libxml/SAX.h>
30 #include <libxml/parserInternals.h>
31 #include <libgnomevfs/gnome-vfs.h>
32 #include <glib/gi18n.h>
36 #include "rb-podcast-parse.h"
38 #define BUFFER_SIZE 256
/*
 * State shared by the SAX callbacks in this file;
 * rb_podcast_parse_load_feed() installs it as the libxml parser's userData.
 * The callbacks also reference ctx->in_unknown_elt, ctx->prop_value and
 * ctx->state.
 */
40 struct RBPoadcastLoadContext
43 xmlParserCtxtPtr xmlctx
;
/* channel record that rb_set_channel_value() fills in */
45 RBPodcastChannel
*channel_data
;
/* item under construction; allocated when an "item" element starts */
46 RBPodcastItem
*item_data
;
/*
 * Parser states.  The three *_PROPERTY states are the ones in which
 * rb_podcast_parser_characters() appends text to ctx->prop_value.
 */
49 RB_PODCAST_PARSER_STATE_START
,
50 RB_PODCAST_PARSER_STATE_RSS
,
51 RB_PODCAST_PARSER_STATE_CHANNEL
,
52 RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY
,
53 RB_PODCAST_PARSER_STATE_IMG
,
54 RB_PODCAST_PARSER_STATE_IMG_PROPERTY
,
55 RB_PODCAST_PARSER_STATE_ITEM
,
56 RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
,
57 RB_PODCAST_PARSER_STATE_END
,
61 static gboolean
rb_validate_channel_propert (const char *name
);
62 static gboolean
rb_validate_item_propert (const char *name
);
63 static uintmax_t rb_podcast_parse_date (const char* date_str
);
64 static gulong
rb_podcast_parse_time (const char *time_str
);
65 static void rb_podcast_parser_start_element (struct RBPoadcastLoadContext
* ctx
, const char *name
, const char **attrs
);
66 static void rb_podcast_parser_end_element (struct RBPoadcastLoadContext
* ctx
, const char *name
);
67 static void rb_podcast_parser_characters (struct RBPoadcastLoadContext
* ctx
, const char *data
, guint len
);
68 static void rb_set_channel_value (struct RBPoadcastLoadContext
* ctx
, const char* name
, const char* value
);
69 static void rb_set_item_value (struct RBPoadcastLoadContext
* ctx
, const char* name
, const char* value
);
71 static RBPodcastItem
*
72 rb_podcast_initializa_item ()
74 RBPodcastItem
*data
= g_new0 (RBPodcastItem
, 1);
79 rb_set_channel_value (struct RBPoadcastLoadContext
*ctx
,
91 dvalue
= xmlCharStrdup (value
);
92 g_strstrip ((char *)dvalue
);
94 if (!strcmp (name
, "title")) {
95 ctx
->channel_data
->title
= dvalue
;
96 } else if (!strcmp (name
, "language")) {
97 ctx
->channel_data
->lang
= dvalue
;
98 } else if (!strcmp (name
, "itunes:subtitle")) {
99 ctx
->channel_data
->subtitle
= dvalue
;
100 } else if (!strcmp (name
, "itunes:summary")) {
101 ctx
->channel_data
->summary
= dvalue
;
102 } else if (!strcmp (name
, "description")) {
103 ctx
->channel_data
->description
= dvalue
;
104 } else if (!strcmp (name
, "generator")) {
105 if (ctx
->channel_data
->author
== NULL
)
106 ctx
->channel_data
->author
= dvalue
;
107 } else if (!strcmp (name
, "itunes:author")) {
108 g_free (ctx
->channel_data
->author
);
109 ctx
->channel_data
->author
= dvalue
;
110 } else if (!strcmp (name
, "webMaster")) {
111 ctx
->channel_data
->contact
= dvalue
;
112 } else if (!strcmp (name
, "pubDate")) {
113 ctx
->channel_data
->pub_date
= rb_podcast_parse_date ((char *)dvalue
);
115 } else if (!strcmp (name
, "copyright")) {
116 ctx
->channel_data
->copyright
= dvalue
;
117 } else if (!strcmp (name
, "img")) {
118 ctx
->channel_data
->img
= dvalue
;
125 rb_set_item_value (struct RBPoadcastLoadContext
*ctx
,
131 dvalue
= xmlCharStrdup (value
);
132 g_strstrip ((char *)dvalue
);
134 if (!strcmp (name
, "title")) {
135 ctx
->item_data
->title
= dvalue
;
136 } else if (!strcmp (name
, "url")) {
137 ctx
->item_data
->url
= dvalue
;
138 } else if (!strcmp (name
, "pubDate")) {
139 ctx
->item_data
->pub_date
= rb_podcast_parse_date ((char *)dvalue
);
141 } else if (!strcmp (name
, "description")) {
142 ctx
->item_data
->description
= dvalue
;
143 } else if (!strcmp (name
, "author")) {
144 ctx
->item_data
->author
= dvalue
;
145 } else if (!strcmp (name
, "itunes:duration")) {
146 ctx
->item_data
->duration
= rb_podcast_parse_time ((char *)dvalue
);
148 } else if (!strcmp (name
, "length")) {
149 ctx
->item_data
->filesize
= g_ascii_strtoull ((char *)dvalue
, NULL
, 10);
/*
 * Add the item under construction (ctx->item_data) to the front of the
 * channel's posts list.  rb_podcast_parse_load_feed() reverses that list once
 * parsing has finished.
 * NOTE(review): the "skipping" message suggests that an item without a URL is
 * not inserted; nothing visible here releases such an item -- confirm it is
 * not leaked.
 */
156 rb_insert_item (struct RBPoadcastLoadContext
*ctx
)
158 RBPodcastItem
*data
= ctx
->item_data
;
160 rb_debug ("Inserting item as post");
163 rb_debug ("Item does not have a URL, skipping");
/* prepend: cheap for a GList, order is fixed up by the caller afterwards */
167 ctx
->channel_data
->posts
= g_list_prepend (ctx
->channel_data
->posts
, ctx
->item_data
);
/*
 * Compare NAME against the channel-level element names listed below.
 * rb_podcast_parser_start_element() counts a channel child as an unknown
 * element when this check fails, and otherwise enters
 * RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY so that its text is collected.
 * NOTE(review): "lastBuildDate" is accepted here, but rb_set_channel_value()
 * has no visible branch for it -- confirm whether that is intended.
 */
171 rb_validate_channel_propert (const char *name
)
177 if (!strcmp (name
, "title") ||
178 !strcmp (name
, "language") ||
179 !strcmp (name
, "itunes:subtitle") ||
180 !strcmp (name
, "itunes:summary") ||
181 !strcmp (name
, "description") ||
182 !strcmp (name
, "generator") ||
183 !strcmp (name
, "itunes:author") ||
184 !strcmp (name
, "webMaster") ||
185 !strcmp (name
, "lastBuildDate") ||
186 !strcmp (name
, "pubDate") ||
187 !strcmp (name
, "copyright")) {
/*
 * Compare NAME against the item-level element names listed below.
 * rb_podcast_parser_start_element() counts an item child as an unknown
 * element when this check fails, and otherwise enters
 * RB_PODCAST_PARSER_STATE_ITEM_PROPERTY so that its text is collected.
 * "enclosure" is handled separately by the caller and is not listed here.
 */
196 rb_validate_item_propert (const char *name
)
202 if (!strcmp (name
, "title") ||
203 !strcmp (name
, "url") ||
204 !strcmp (name
, "pubDate") ||
205 !strcmp (name
, "description") ||
206 !strcmp (name
, "author") ||
207 !strcmp (name
, "itunes:duration") ) {
216 rb_podcast_parser_start_element (struct RBPoadcastLoadContext
*ctx
,
221 rb_debug ("Start element: %s state: %d", name
, ctx
->state
);
223 switch (ctx
->state
) {
224 case RB_PODCAST_PARSER_STATE_START
:
226 if (!strcmp (name
, "rss")) {
227 ctx
->state
= RB_PODCAST_PARSER_STATE_RSS
;
229 ctx
->in_unknown_elt
++;
235 case RB_PODCAST_PARSER_STATE_RSS
:
237 if (!strcmp (name
, "channel")) {
238 ctx
->state
= RB_PODCAST_PARSER_STATE_CHANNEL
;
240 ctx
->in_unknown_elt
++;
246 case RB_PODCAST_PARSER_STATE_CHANNEL
:
249 if (strcmp (name
, "image") == 0
250 || strcmp (name
, "itunes:image") == 0) {
251 ctx
->state
= RB_PODCAST_PARSER_STATE_IMG
;
252 } else if (!strcmp (name
, "item")) {
253 ctx
->item_data
= rb_podcast_initializa_item ();
254 ctx
->state
= RB_PODCAST_PARSER_STATE_ITEM
;
255 } else if (!rb_validate_channel_propert (name
)) {
256 rb_debug ("Unknown property");
257 ctx
->in_unknown_elt
++;
259 ctx
->state
= RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY
;
265 case RB_PODCAST_PARSER_STATE_ITEM
:
267 if (!strcmp (name
, "enclosure")) {
268 for (; *attrs
; attrs
+=2) {
269 if (!strcmp (*attrs
, "url")) {
270 const char *url_value
= *(attrs
+ 1);
271 rb_set_item_value (ctx
, "url", url_value
);
272 } else if (!strcmp (*attrs
, "length")) {
273 const char *length_value
= *(attrs
+ 1);
274 rb_set_item_value (ctx
, "length", length_value
);
278 ctx
->state
= RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
;
280 } else if (!rb_validate_item_propert (name
)) {
281 ctx
->in_unknown_elt
++;
283 ctx
->state
= RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
;
289 case RB_PODCAST_PARSER_STATE_IMG
:
291 if (strcmp (name
, "url") != 0) {
292 ctx
->in_unknown_elt
++;
294 ctx
->state
= RB_PODCAST_PARSER_STATE_IMG_PROPERTY
;
300 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY
:
301 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
:
302 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY
:
303 rb_debug ("nested element inside property; treating as unknown");
304 ctx
->in_unknown_elt
++;
307 case RB_PODCAST_PARSER_STATE_END
:
310 g_warning ("Unknown podcast parser state: %d", ctx
->state
);
/*
 * SAX endElement callback: step the state machine back out of the element
 * that is closing.
 *
 * While ctx->in_unknown_elt is positive the counter is decremented for the
 * closing element.  Otherwise the current state selects the action: the
 * *_PROPERTY states pass the text collected in ctx->prop_value to
 * rb_set_channel_value() / rb_set_item_value(), return to the enclosing
 * state and empty the buffer; leaving ITEM hands the finished item to
 * rb_insert_item().
 */
316 rb_podcast_parser_end_element (struct RBPoadcastLoadContext
*ctx
,
319 rb_debug ("End element: %s state: %d", name
, ctx
->state
);
/* closing tag of an element that rb_podcast_parser_start_element() counted as unknown */
321 if (ctx
->in_unknown_elt
> 0) {
322 ctx
->in_unknown_elt
--;
323 rb_debug ("Unknown element");
327 switch (ctx
->state
) {
328 case RB_PODCAST_PARSER_STATE_START
:
329 ctx
->state
= RB_PODCAST_PARSER_STATE_END
;
332 case RB_PODCAST_PARSER_STATE_RSS
:
333 ctx
->state
= RB_PODCAST_PARSER_STATE_START
;
336 case RB_PODCAST_PARSER_STATE_CHANNEL
:
337 ctx
->state
= RB_PODCAST_PARSER_STATE_RSS
;
340 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY
:
/* the element name doubles as the property name */
342 rb_set_channel_value (ctx
, name
, ctx
->prop_value
->str
);
343 ctx
->state
= RB_PODCAST_PARSER_STATE_CHANNEL
;
344 g_string_truncate (ctx
->prop_value
, 0);
348 case RB_PODCAST_PARSER_STATE_ITEM
:
350 rb_insert_item (ctx
);
351 ctx
->state
= RB_PODCAST_PARSER_STATE_CHANNEL
;
355 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
:
357 rb_set_item_value (ctx
, name
, ctx
->prop_value
->str
);
358 ctx
->state
= RB_PODCAST_PARSER_STATE_ITEM
;
359 g_string_truncate (ctx
->prop_value
, 0);
363 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY
:
/* the collected text is always stored under the fixed name "img" */
365 rb_set_channel_value (ctx
, "img", ctx
->prop_value
->str
);
366 ctx
->state
= RB_PODCAST_PARSER_STATE_IMG
;
367 g_string_truncate (ctx
->prop_value
, 0);
371 case RB_PODCAST_PARSER_STATE_IMG
:
372 ctx
->state
= RB_PODCAST_PARSER_STATE_CHANNEL
;
375 case RB_PODCAST_PARSER_STATE_END
:
379 g_warning ("Unknown podcast parser state: %d", ctx
->state
);
/*
 * SAX characters callback.  In the three *_PROPERTY states the LEN bytes at
 * DATA are appended to ctx->prop_value; rb_podcast_parser_end_element()
 * consumes and empties that buffer when the property element closes.  The
 * remaining known states are listed without a visible action of their own.
 */
385 rb_podcast_parser_characters (struct RBPoadcastLoadContext
*ctx
,
389 switch (ctx
->state
) {
390 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY
:
391 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
:
392 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY
:
/* text can arrive in several pieces, hence append rather than assign */
393 g_string_append_len (ctx
->prop_value
, data
, len
);
395 case RB_PODCAST_PARSER_STATE_START
:
396 case RB_PODCAST_PARSER_STATE_IMG
:
397 case RB_PODCAST_PARSER_STATE_RSS
:
398 case RB_PODCAST_PARSER_STATE_CHANNEL
:
399 case RB_PODCAST_PARSER_STATE_ITEM
:
400 case RB_PODCAST_PARSER_STATE_END
:
403 g_warning ("Unknown podcast parser state: %d", ctx
->state
);
/*
 * Fetch the feed at FILE_NAME and parse it into DATA.
 *
 * Steps visible below:
 *  - a copy of FILE_NAME is stored in data->url;
 *  - unless the name ends in ".rss" or ".xml", the file info is queried and
 *    a mime type containing none of "html", "xml" or "rss" is treated as
 *    suspicious; the user is then asked whether to use the URL anyway;
 *  - the whole file is read through gnome-vfs;
 *  - a SAX handler with the three callbacks of this file parses the buffer,
 *    with a freshly allocated RBPoadcastLoadContext as userData;
 *  - the posts list is reversed at the end.
 */
409 rb_podcast_parse_load_feed (RBPodcastChannel
*data
,
410 const char *file_name
)
412 xmlParserCtxtPtr ctxt
;
413 xmlSAXHandlerPtr sax_handler
= NULL
;
414 GnomeVFSResult result
;
415 GnomeVFSFileInfo
*info
;
417 gchar
*buffer
= NULL
;
419 struct RBPoadcastLoadContext
*ctx
= NULL
;
/* NOTE(review): a url already stored in data is overwritten, not freed -- confirm callers pass a fresh channel */
421 data
->url
= xmlCharStrdup (file_name
);
/* only names without a feed-like suffix get the mime-type check */
423 if (!g_str_has_suffix (file_name
, ".rss") && !g_str_has_suffix (file_name
, ".xml")) {
424 gboolean invalid_mime_type
;
426 info
= gnome_vfs_file_info_new ();
428 result
= gnome_vfs_get_file_info (file_name
, info
, GNOME_VFS_FILE_INFO_DEFAULT
);
431 && info
->mime_type
!= NULL
432 && strstr (info
->mime_type
, "html") == NULL
433 && strstr (info
->mime_type
, "xml") == NULL
434 && strstr (info
->mime_type
, "rss") == NULL
) {
435 invalid_mime_type
= TRUE
;
437 invalid_mime_type
= FALSE
;
/*
 * NOTE(review): this branch tests the file-info result, yet the message talks
 * about the mime type and prints info->mime_type, which the check above
 * treats as possibly NULL -- confirm.
 */
440 if ((result
!= GNOME_VFS_OK
)) {
441 rb_debug ("Invalid mime-type in podcast feed %s", info
->mime_type
);
442 gnome_vfs_file_info_unref (info
);
446 if (invalid_mime_type
) {
/* the question dialog is created, run and destroyed between GDK_THREADS_ENTER/LEAVE */
449 GDK_THREADS_ENTER ();
450 dialog
= gtk_message_dialog_new (NULL
, 0,
451 GTK_MESSAGE_QUESTION
,
453 _("The URL '%s' does not appear to be a podcast feed. "
454 "It may be the wrong URL, or the feed may be broken. "
455 "Would you like Rhythmbox to attempt to use it anyway?"),
/* answering "yes" clears the flag so that parsing goes ahead */
458 if (gtk_dialog_run (GTK_DIALOG (dialog
)) == GTK_RESPONSE_YES
)
459 invalid_mime_type
= FALSE
;
461 gtk_widget_destroy (dialog
);
462 GDK_THREADS_LEAVE ();
465 gnome_vfs_file_info_unref (info
);
467 if (invalid_mime_type
)
471 /* first download file by gnome_vfs for use gnome network configuration */
472 result
= gnome_vfs_read_entire_file (file_name
, &file_size
, &buffer
);
473 if (result
!= GNOME_VFS_OK
)
476 /* initializing parse */
477 sax_handler
= g_new0 (xmlSAXHandler
, 1);
478 sax_handler
->startElement
= (startElementSAXFunc
) rb_podcast_parser_start_element
;
479 sax_handler
->endElement
= (endElementSAXFunc
) rb_podcast_parser_end_element
;
480 sax_handler
->characters
= (charactersSAXFunc
) rb_podcast_parser_characters
;
481 xmlSubstituteEntitiesDefault (1);
/* per-parse state handed to the callbacks; prop_value collects element text */
483 ctx
= g_new0 (struct RBPoadcastLoadContext
, 1);
484 ctx
->in_unknown_elt
= 0;
485 ctx
->channel_data
= data
;
486 ctx
->prop_value
= g_string_sized_new (512);
488 ctxt
= xmlCreateMemoryParserCtxt (buffer
, file_size
);
/* clean-up done before the parser context is used -- presumably the failure path; confirm */
490 g_free (sax_handler
);
492 g_string_free (ctx
->prop_value
, TRUE
);
498 ctxt
->userData
= ctx
;
499 ctxt
->sax
= sax_handler
;
500 xmlParseDocument (ctxt
);
502 g_free (sax_handler
);
504 xmlFreeParserCtxt (ctxt
);
507 g_string_free (ctx
->prop_value
, TRUE
);
/* rb_insert_item() prepends, so reverse once to get document order */
510 data
->posts
= g_list_reverse (data
->posts
);
/*
 * Convert a feed date string to a timestamp.
 *
 * A series of strptime() formats is tried in turn, each one only when the
 * previous attempt returned NULL, with the struct tm cleared before every
 * retry.  The result is mktime() of whatever was parsed, cast to uintmax_t;
 * if no format matched, the struct tm is cleared and a debug message is
 * logged before mktime() is called.
 * NOTE(review): mktime() interprets the broken-down time as local time --
 * confirm that this is what callers expect for feed dates.
 */
515 rb_podcast_parse_date (const char *date_str
)
520 /* RFC 2822 date format */
521 result
= strptime (date_str
, "%a, %d %b %Y %T", &tm
);
523 /* same as above, but without comma */
524 if (result
== NULL
) {
525 memset (&tm
, 0, sizeof (struct tm
));
526 result
= strptime (date_str
, "%a %d %b %Y %T", &tm
);
529 /* close-to-RFC 2822, but with extra 0 */
530 if (result
== NULL
) {
531 memset (&tm
, 0, sizeof (struct tm
));
532 result
= strptime (date_str
, "%a, %d %b %Y 0%T", &tm
);
535 /* format without weekday */
536 if (result
== NULL
) {
537 memset (&tm
, 0, sizeof (struct tm
));
538 result
= strptime (date_str
, "%d %b %Y %T", &tm
);
541 /* reversed day and long month */
542 if (result
== NULL
) {
543 memset (&tm
, 0, sizeof (struct tm
));
544 result
= strptime (date_str
, "%a, %B %d %Y %T", &tm
);
/* numeric year-month-day followed by a time */
548 if (result
== NULL
) {
549 memset (&tm
, 0, sizeof (struct tm
));
550 result
= strptime (date_str
, "%Y-%m-%d %T", &tm
);
553 /* ISO date like without timezone */
554 if (result
== NULL
) {
555 memset (&tm
, 0, sizeof (struct tm
));
556 result
= strptime (date_str
, "%Y-%m-%d", &tm
);
559 /* Broken weekday short names */
560 if (result
== NULL
) {
563 /* strip off the erroneous weekday */
564 tmp
= strstr (date_str
, ",");
567 memset (&tm
, 0, sizeof (struct tm
));
568 result
= strptime (tmp
, "%d %b %Y %T", &tm
);
572 /* format with timezone offset from GMT */
573 if (result
== NULL
) {
574 memset (&tm
, 0, sizeof (struct tm
));
575 result
= strptime (date_str
, "%a %b %d %T %z %Y", &tm
);
578 /* format with timezone name */
579 if (result
== NULL
) {
582 memset (&tm
, 0, sizeof (struct tm
));
584 /* match first part of time string */
585 result
= strptime (date_str
, "%a %b %d %T ", &tm
);
587 /* look for anything with a timezone name-like format
588 i.e. at least one all caps alphabetical character */
589 if (result
!= NULL
) {
/* n counts the leading capital letters at the point where strptime() stopped */
592 n
= strspn(result
, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
595 /* make sure there was at least one character that matched */
596 if ((tmp
!= NULL
) && n
> 0)
597 /* remaining part must be the year */
598 result
= strptime (tmp
, "%Y", &tm
);
/* nothing matched: clear tm so that mktime() gets a defined input */
604 if (result
== NULL
) {
605 memset (&tm
, 0, sizeof (struct tm
));
606 rb_debug ("unable to convert date string %s", date_str
);
609 return (uintmax_t) mktime (&tm
);
613 rb_podcast_parse_time (const char *time_str
)
618 memset (&tm
, 0, sizeof (struct tm
));
619 result
= strptime (time_str
, "%H:%M:%S", &tm
);
620 if (result
== NULL
) {
621 memset (&tm
, 0, sizeof (struct tm
));
622 result
= strptime (time_str
, "%M:%S", &tm
);
624 if (result
== NULL
) {
625 memset (&tm
, 0, sizeof (struct tm
));
626 rb_debug ("unable to convert duration string %s", time_str
);
629 return ((tm
.tm_hour
* 60 + tm
.tm_min
) * 60 + tm
.tm_sec
);
/*
 * Release what a parsed channel owns: every item on data->posts (through
 * rb_podcast_parse_item_free()), the list itself, and the channel's string
 * fields.  A NULL DATA is rejected by g_return_if_fail().
 */
633 rb_podcast_parse_channel_free (RBPodcastChannel
*data
)
635 g_return_if_fail (data
!= NULL
);
/* free each item first, then the list nodes that held them */
637 g_list_foreach (data
->posts
, (GFunc
) rb_podcast_parse_item_free
, NULL
);
638 g_list_free (data
->posts
);
642 g_free (data
->title
);
644 g_free (data
->subtitle
);
645 g_free (data
->summary
);
646 g_free (data
->description
);
647 g_free (data
->author
);
648 g_free (data
->contact
);
650 g_free (data
->copyright
);
657 rb_podcast_parse_item_free (RBPodcastItem
*item
)
659 g_return_if_fail (item
!= NULL
);
661 g_free (item
->title
);
663 g_free (item
->description
);