1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
3 * arch-tag: Implementation of podcast parse
5 * Copyright (C) 2005 Renato Araujo Oliveira Filho - INdT <renato.filho@indt.org.br>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
24 #define __EXTENSIONS__ /* get strptime */
28 #include <libxml/entities.h>
29 #include <libxml/SAX.h>
30 #include <libxml/parserInternals.h>
31 #include <libgnomevfs/gnome-vfs.h>
32 #include <glib/gi18n.h>
36 #include "rb-podcast-parse.h"
38 #define BUFFER_SIZE 256
40 struct RBPoadcastLoadContext
43 xmlParserCtxtPtr xmlctx
;
45 RBPodcastChannel
*channel_data
;
46 RBPodcastItem
*item_data
;
49 RB_PODCAST_PARSER_STATE_START
,
50 RB_PODCAST_PARSER_STATE_RSS
,
51 RB_PODCAST_PARSER_STATE_CHANNEL
,
52 RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY
,
53 RB_PODCAST_PARSER_STATE_IMG
,
54 RB_PODCAST_PARSER_STATE_IMG_PROPERTY
,
55 RB_PODCAST_PARSER_STATE_ITEM
,
56 RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
,
57 RB_PODCAST_PARSER_STATE_END
,
61 static gboolean
rb_validate_channel_propert (const char *name
);
62 static gboolean
rb_validate_item_propert (const char *name
);
63 static uintmax_t rb_podcast_parse_date (const char* date_str
);
64 static gulong
rb_podcast_parse_time (const char *time_str
);
65 static void rb_podcast_parser_start_element (struct RBPoadcastLoadContext
* ctx
, const char *name
, const char **attrs
);
66 static void rb_podcast_parser_end_element (struct RBPoadcastLoadContext
* ctx
, const char *name
);
67 static void rb_podcast_parser_characters (struct RBPoadcastLoadContext
* ctx
, const char *data
, guint len
);
68 static void rb_set_channel_value (struct RBPoadcastLoadContext
* ctx
, const char* name
, const char* value
);
69 static void rb_set_item_value (struct RBPoadcastLoadContext
* ctx
, const char* name
, const char* value
);
71 static RBPodcastItem
*
72 rb_podcast_initializa_item ()
74 RBPodcastItem
*data
= g_new0 (RBPodcastItem
, 1);
79 rb_set_channel_value (struct RBPoadcastLoadContext
*ctx
,
91 dvalue
= xmlCharStrdup (value
);
92 g_strstrip ((char *)dvalue
);
94 if (!strcmp (name
, "title")) {
95 ctx
->channel_data
->title
= dvalue
;
96 } else if (!strcmp (name
, "language")) {
97 ctx
->channel_data
->lang
= dvalue
;
98 } else if (!strcmp (name
, "itunes:subtitle")) {
99 ctx
->channel_data
->subtitle
= dvalue
;
100 } else if (!strcmp (name
, "itunes:summary")) {
101 ctx
->channel_data
->summary
= dvalue
;
102 } else if (!strcmp (name
, "description")) {
103 ctx
->channel_data
->description
= dvalue
;
104 } else if (!strcmp (name
, "generator")) {
105 if (ctx
->channel_data
->author
== NULL
)
106 ctx
->channel_data
->author
= dvalue
;
107 } else if (!strcmp (name
, "itunes:author")) {
108 g_free (ctx
->channel_data
->author
);
109 ctx
->channel_data
->author
= dvalue
;
110 } else if (!strcmp (name
, "webMaster")) {
111 ctx
->channel_data
->contact
= dvalue
;
112 } else if (!strcmp (name
, "pubDate")) {
113 ctx
->channel_data
->pub_date
= rb_podcast_parse_date ((char *)dvalue
);
115 } else if (!strcmp (name
, "copyright")) {
116 ctx
->channel_data
->copyright
= dvalue
;
117 } else if (!strcmp (name
, "img")) {
118 ctx
->channel_data
->img
= dvalue
;
126 rb_set_item_value (struct RBPoadcastLoadContext
*ctx
,
132 dvalue
= xmlCharStrdup (value
);
133 g_strstrip ((char *)dvalue
);
135 if (!strcmp (name
, "title")) {
136 ctx
->item_data
->title
= dvalue
;
137 } else if (!strcmp (name
, "url")) {
138 ctx
->item_data
->url
= dvalue
;
139 } else if (!strcmp (name
, "pubDate")) {
140 ctx
->item_data
->pub_date
= rb_podcast_parse_date ((char *)dvalue
);
142 } else if (!strcmp (name
, "description")) {
143 ctx
->item_data
->description
= dvalue
;
144 } else if (!strcmp (name
, "author")) {
145 ctx
->item_data
->author
= dvalue
;
146 } else if (!strcmp (name
, "itunes:duration")) {
147 ctx
->item_data
->duration
= rb_podcast_parse_time ((char *)dvalue
);
149 } else if (!strcmp (name
, "length")) {
150 ctx
->item_data
->filesize
= g_ascii_strtoull ((char *)dvalue
, NULL
, 10);
158 rb_insert_item (struct RBPoadcastLoadContext
*ctx
)
160 RBPodcastItem
*data
= ctx
->item_data
;
162 rb_debug ("Inserting item as post");
165 rb_debug ("Item does not have a URL, skipping");
169 ctx
->channel_data
->posts
= g_list_prepend (ctx
->channel_data
->posts
, ctx
->item_data
);
173 rb_validate_channel_propert (const char *name
)
179 if (!strcmp (name
, "title") ||
180 !strcmp (name
, "language") ||
181 !strcmp (name
, "itunes:subtitle") ||
182 !strcmp (name
, "itunes:summary") ||
183 !strcmp (name
, "description") ||
184 !strcmp (name
, "generator") ||
185 !strcmp (name
, "itunes:author") ||
186 !strcmp (name
, "webMaster") ||
187 !strcmp (name
, "lastBuildDate") ||
188 !strcmp (name
, "pubDate") ||
189 !strcmp (name
, "copyright")) {
198 rb_validate_item_propert (const char *name
)
204 if (!strcmp (name
, "title") ||
205 !strcmp (name
, "url") ||
206 !strcmp (name
, "pubDate") ||
207 !strcmp (name
, "description") ||
208 !strcmp (name
, "author") ||
209 !strcmp (name
, "itunes:duration") ) {
219 rb_podcast_parser_start_element (struct RBPoadcastLoadContext
*ctx
,
224 rb_debug ("Start element: %s state: %d", name
, ctx
->state
);
226 switch (ctx
->state
) {
227 case RB_PODCAST_PARSER_STATE_START
:
229 if (!strcmp (name
, "rss")) {
230 ctx
->state
= RB_PODCAST_PARSER_STATE_RSS
;
232 ctx
->in_unknown_elt
++;
238 case RB_PODCAST_PARSER_STATE_RSS
:
240 if (!strcmp (name
, "channel")) {
241 ctx
->state
= RB_PODCAST_PARSER_STATE_CHANNEL
;
243 ctx
->in_unknown_elt
++;
249 case RB_PODCAST_PARSER_STATE_CHANNEL
:
252 if (strcmp (name
, "image") == 0
253 || strcmp (name
, "itunes:image") == 0) {
254 ctx
->state
= RB_PODCAST_PARSER_STATE_IMG
;
255 } else if (!strcmp (name
, "item")) {
256 ctx
->item_data
= rb_podcast_initializa_item ();
257 ctx
->state
= RB_PODCAST_PARSER_STATE_ITEM
;
258 } else if (!rb_validate_channel_propert (name
)) {
259 rb_debug ("Unknown property");
260 ctx
->in_unknown_elt
++;
262 ctx
->state
= RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY
;
268 case RB_PODCAST_PARSER_STATE_ITEM
:
270 if (!strcmp (name
, "enclosure")) {
271 for (; *attrs
; attrs
+=2) {
272 if (!strcmp (*attrs
, "url")) {
273 const char *url_value
= *(attrs
+ 1);
274 rb_set_item_value (ctx
, "url", url_value
);
275 } else if (!strcmp (*attrs
, "length")) {
276 const char *length_value
= *(attrs
+ 1);
277 rb_set_item_value (ctx
, "length", length_value
);
281 ctx
->state
= RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
;
283 } else if (!rb_validate_item_propert (name
)) {
284 ctx
->in_unknown_elt
++;
286 ctx
->state
= RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
;
292 case RB_PODCAST_PARSER_STATE_IMG
:
294 if (strcmp (name
, "url") != 0) {
295 ctx
->in_unknown_elt
++;
297 ctx
->state
= RB_PODCAST_PARSER_STATE_IMG_PROPERTY
;
303 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY
:
304 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
:
305 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY
:
306 case RB_PODCAST_PARSER_STATE_END
:
309 g_warning ("Unknown podcast parser state: %d", ctx
->state
);
316 rb_podcast_parser_end_element (struct RBPoadcastLoadContext
*ctx
,
319 rb_debug ("End element: %s state: %d", name
, ctx
->state
);
321 if (ctx
->in_unknown_elt
> 0) {
322 ctx
->in_unknown_elt
--;
323 rb_debug ("Unknown element");
327 switch (ctx
->state
) {
328 case RB_PODCAST_PARSER_STATE_START
:
329 ctx
->state
= RB_PODCAST_PARSER_STATE_END
;
332 case RB_PODCAST_PARSER_STATE_RSS
:
333 ctx
->state
= RB_PODCAST_PARSER_STATE_START
;
336 case RB_PODCAST_PARSER_STATE_CHANNEL
:
337 ctx
->state
= RB_PODCAST_PARSER_STATE_RSS
;
340 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY
:
342 rb_set_channel_value (ctx
, name
, ctx
->prop_value
->str
);
343 ctx
->state
= RB_PODCAST_PARSER_STATE_CHANNEL
;
344 g_string_truncate (ctx
->prop_value
, 0);
348 case RB_PODCAST_PARSER_STATE_ITEM
:
350 rb_insert_item (ctx
);
351 ctx
->state
= RB_PODCAST_PARSER_STATE_CHANNEL
;
355 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
:
357 rb_set_item_value (ctx
, name
, ctx
->prop_value
->str
);
358 ctx
->state
= RB_PODCAST_PARSER_STATE_ITEM
;
359 g_string_truncate (ctx
->prop_value
, 0);
363 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY
:
365 rb_set_channel_value (ctx
, "img", ctx
->prop_value
->str
);
366 ctx
->state
= RB_PODCAST_PARSER_STATE_IMG
;
367 g_string_truncate (ctx
->prop_value
, 0);
371 case RB_PODCAST_PARSER_STATE_IMG
:
372 ctx
->state
= RB_PODCAST_PARSER_STATE_CHANNEL
;
375 case RB_PODCAST_PARSER_STATE_END
:
379 g_warning ("Unknown podcast parser state: %d", ctx
->state
);
386 rb_podcast_parser_characters (struct RBPoadcastLoadContext
*ctx
,
390 switch (ctx
->state
) {
391 case RB_PODCAST_PARSER_STATE_CHANNEL_PROPERTY
:
392 case RB_PODCAST_PARSER_STATE_ITEM_PROPERTY
:
393 case RB_PODCAST_PARSER_STATE_IMG_PROPERTY
:
394 g_string_append_len (ctx
->prop_value
, data
, len
);
396 case RB_PODCAST_PARSER_STATE_START
:
397 case RB_PODCAST_PARSER_STATE_IMG
:
398 case RB_PODCAST_PARSER_STATE_RSS
:
399 case RB_PODCAST_PARSER_STATE_CHANNEL
:
400 case RB_PODCAST_PARSER_STATE_ITEM
:
401 case RB_PODCAST_PARSER_STATE_END
:
404 g_warning ("Unknown podcast parser state: %d", ctx
->state
);
411 rb_podcast_parse_load_feed (RBPodcastChannel
*data
,
412 const char *file_name
)
414 xmlParserCtxtPtr ctxt
;
415 xmlSAXHandlerPtr sax_handler
= NULL
;
416 GnomeVFSResult result
;
417 GnomeVFSFileInfo
*info
;
419 gchar
*buffer
= NULL
;
421 struct RBPoadcastLoadContext
*ctx
= NULL
;
423 data
->url
= xmlCharStrdup (file_name
);
425 if (!g_str_has_suffix (file_name
, ".rss") && !g_str_has_suffix (file_name
, ".xml")) {
426 gboolean invalid_mime_type
;
428 info
= gnome_vfs_file_info_new ();
430 result
= gnome_vfs_get_file_info (file_name
, info
, GNOME_VFS_FILE_INFO_DEFAULT
);
433 && info
->mime_type
!= NULL
434 && strstr (info
->mime_type
, "xml") == NULL
435 && strstr (info
->mime_type
, "rss") == NULL
) {
436 invalid_mime_type
= TRUE
;
438 invalid_mime_type
= FALSE
;
441 if ((result
!= GNOME_VFS_OK
)) {
442 rb_debug ("Invalid mime-type in podcast feed %s", info
->mime_type
);
443 gnome_vfs_file_info_unref (info
);
447 if (invalid_mime_type
) {
450 GDK_THREADS_ENTER ();
451 dialog
= gtk_message_dialog_new (NULL
, 0,
452 GTK_MESSAGE_QUESTION
,
454 _("The URL '%s' does not appear to be a podcast feed. "
455 "It may be the wrong URL, or the feed may be broken. "
456 "Would you like Rhythmbox to attempt to use it anyway?"),
459 if (gtk_dialog_run (GTK_DIALOG (dialog
)) == GTK_RESPONSE_YES
)
460 invalid_mime_type
= FALSE
;
462 gtk_widget_destroy (dialog
);
463 GDK_THREADS_LEAVE ();
466 gnome_vfs_file_info_unref (info
);
468 if (invalid_mime_type
)
472 /* first download file by gnome_vfs for use gnome network configuration */
473 result
= gnome_vfs_read_entire_file (file_name
, &file_size
, &buffer
);
474 if (result
!= GNOME_VFS_OK
)
478 /* initializing parse */
479 sax_handler
= g_new0 (xmlSAXHandler
, 1);
480 sax_handler
->startElement
= (startElementSAXFunc
) rb_podcast_parser_start_element
;
481 sax_handler
->endElement
= (endElementSAXFunc
) rb_podcast_parser_end_element
;
482 sax_handler
->characters
= (charactersSAXFunc
) rb_podcast_parser_characters
;
483 xmlSubstituteEntitiesDefault (1);
485 ctx
= g_new0 (struct RBPoadcastLoadContext
, 1);
486 ctx
->in_unknown_elt
= 0;
487 ctx
->channel_data
= data
;
488 ctx
->prop_value
= g_string_sized_new (512);
490 ctxt
= xmlCreateMemoryParserCtxt (buffer
, file_size
);
492 g_free (sax_handler
);
494 g_string_free (ctx
->prop_value
, TRUE
);
500 ctxt
->userData
= ctx
;
501 ctxt
->sax
= sax_handler
;
502 xmlParseDocument (ctxt
);
504 g_free (sax_handler
);
506 xmlFreeParserCtxt (ctxt
);
509 g_string_free (ctx
->prop_value
, TRUE
);
512 data
->posts
= g_list_reverse (data
->posts
);
517 rb_podcast_parse_date (const char *date_str
)
522 /* RFC 2822 date format */
523 result
= strptime (date_str
, "%a, %d %b %Y %T", &tm
);
525 /* same as above, but without comma */
526 if (result
== NULL
) {
527 memset (&tm
, 0, sizeof (struct tm
));
528 result
= strptime (date_str
, "%a %d %b %Y %T", &tm
);
531 /* close-to-RFC 2822, but with extra 0 */
532 if (result
== NULL
) {
533 memset (&tm
, 0, sizeof (struct tm
));
534 result
= strptime (date_str
, "%a, %d %b %Y 0%T", &tm
);
537 /* format without weekday */
538 if (result
== NULL
) {
539 memset (&tm
, 0, sizeof (struct tm
));
540 result
= strptime (date_str
, "%d %b %Y %T", &tm
);
543 /* reversed day and long month */
544 if (result
== NULL
) {
545 memset (&tm
, 0, sizeof (struct tm
));
546 result
= strptime (date_str
, "%a, %B %d %Y %T", &tm
);
550 if (result
== NULL
) {
551 memset (&tm
, 0, sizeof (struct tm
));
552 result
= strptime (date_str
, "%Y-%m-%d %T", &tm
);
555 /* ISO date like without timezone */
556 if (result
== NULL
) {
557 memset (&tm
, 0, sizeof (struct tm
));
558 result
= strptime (date_str
, "%Y-%m-%d", &tm
);
561 /* Broken weekday short names */
562 if (result
== NULL
) {
565 /* strip off the erroneous weekday */
566 tmp
= strstr (date_str
, ",");
569 memset (&tm
, 0, sizeof (struct tm
));
570 result
= strptime (tmp
, "%d %b %Y %T", &tm
);
574 /* format with timezone offset from GMT */
575 if (result
== NULL
) {
576 memset (&tm
, 0, sizeof (struct tm
));
577 result
= strptime (date_str
, "%a %b %d %T %z %Y", &tm
);
580 /* format with timezone name */
581 if (result
== NULL
) {
584 memset (&tm
, 0, sizeof (struct tm
));
586 /* match first part of time string */
587 result
= strptime (date_str
, "%a %b %d %T ", &tm
);
589 /* look for anything with a timezone name-like format
590 i.e. at least one all caps alphabetical character */
591 if (result
!= NULL
) {
594 n
= strspn(result
, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
597 /* make sure there was at least one character that matched */
598 if ((tmp
!= NULL
) && n
> 0)
599 /* remaining part must be the year */
600 result
= strptime (tmp
, "%Y", &tm
);
606 if (result
== NULL
) {
607 memset (&tm
, 0, sizeof (struct tm
));
608 rb_debug ("unable to convert date string %s", date_str
);
611 return (uintmax_t) mktime (&tm
);
615 rb_podcast_parse_time (const char *time_str
)
620 memset (&tm
, 0, sizeof (struct tm
));
621 result
= strptime (time_str
, "%H:%M:%S", &tm
);
622 if (result
== NULL
) {
623 memset (&tm
, 0, sizeof (struct tm
));
624 result
= strptime (time_str
, "%M:%S", &tm
);
626 if (result
== NULL
) {
627 memset (&tm
, 0, sizeof (struct tm
));
628 rb_debug ("unable to convert duration string %s", time_str
);
631 return ((tm
.tm_hour
* 60 + tm
.tm_min
) * 60 + tm
.tm_sec
);
635 rb_podcast_parse_channel_free (RBPodcastChannel
*data
)
637 g_return_if_fail (data
!= NULL
);
639 g_list_foreach (data
->posts
, (GFunc
) rb_podcast_parse_item_free
, NULL
);
640 g_list_free (data
->posts
);
644 g_free (data
->title
);
646 g_free (data
->subtitle
);
647 g_free (data
->summary
);
648 g_free (data
->description
);
649 g_free (data
->author
);
650 g_free (data
->contact
);
652 g_free (data
->copyright
);
659 rb_podcast_parse_item_free (RBPodcastItem
*item
)
661 g_return_if_fail (item
!= NULL
);
663 g_free (item
->title
);
665 g_free (item
->description
);