ICQ process_message must also call escapeHTML.
[thrasher.git] / perl / lib / Thrasher / XMPPStreamIn.pm
blob5626aa8021656df1fa1d81d6df397e5e960cacd4
1 package Thrasher::XMPPStreamIn;
3 =head1 NAME
5 Thrasher::XMPPStream - wrap XML::Parser for XMPP stream parsing
7 =head1 SYNOPSIS
9 my $stream = new Thrasher::XMPPStream;
11 # messages as described below
12 my $messages = $stream->parse($some_text);
13 my $more_messages = $stream->parse($more_text_in_stream);
15 # dies
16 my $yet_more_messages = $stream->parse($invalid_xml);
18 =head1 DESCRIPTION
20 Thrasher::XMPPStream converts from the Jabber XML stream into
21 perl objects representing the events from the XML stream.
23 This abstracts out managing the XML::Parser correctly.
25 =head2 Output format
27 Events are output as array refs with three elements, containing the
28 following:
30 =over 4
32 =item *
34 B<Tag name reference>: The full name of the tag. See "Tags" section
35 below.
37 =item *
39 B<Attributes hash>: A hash of full attribute names to their
40 values. See "Tags" section below.
42 =item *
44 B<Children>: The children of this tag. The children are each one of
45 two things: A string, representing character data, or another tag
46 arrayref as described here.
48 =back
50 =head2 Tags
52 Thrasher::XMPPStream correctly handles namespaces; therefore, all
53 tags and attribute names actually have two parts, the namespace
54 and the tag name itself.
56 Tags are typically represented as an array ref, with the first part
57 representing the full name of the namespace (NOT the prefix,
58 like "stream", the whole URI, like
59 "http://etherx.jabber.org/streams"), and the second part representing
60 the tag name.
62 Attributes are represented in "Clark-style" attribute names, as
63 XML::SAX's documentation calls it, which is the namespace in
64 {}, followed by the tag/attribute name. Note that this means
65 that an attribute with no explicit namespace will be
66 C<{}attname>, not C<attname>.
68 =cut
70 use strict;
71 use warnings;
73 use XML::SAX;
74 $XML::SAX::ParserPackage = "XML::SAX::Expat::Incremental";
75 use XML::SAX::ParserFactory;
77 use Data::Dumper;
78 use Carp qw(confess);
80 use Thrasher::XML qw(:all);
82 use base 'XML::SAX::Base', 'Exporter';
84 use Encode;
86 # Just export the non-methods.
87 our @EXPORT_OK = qw(get_parser);
88 our %EXPORT_TAGS = (all => \@EXPORT_OK);
90 sub get_parser {
91 my $self = Thrasher::XMPPStreamIn->new();
92 my $parser = XML::SAX::ParserFactory->parser
93 (Handler => $self);
94 $self->{parser} = $parser;
95 return $self;
98 sub new {
99 my $self = {_debug => 1};
100 bless $self, 'Thrasher::XMPPStreamIn';
102 my $handlers = {Start => sub { $self->tag_start(@_) },
103 End => sub { $self->tag_end(@_) },
104 Char => sub { $self->char(@_) },
105 XMLDecl => sub { $self->xml_decl(@_) }};
107 # Consists of a list of prefixes->array ref of strings,
108 # where the last string is the current namespace for that
109 # prefix.
110 $self->{namespaces} = {};
112 # this stores up the messages as they are generated, so the
113 # end of the "parse" call can return them.
114 $self->{messages} = [];
116 return $self;
119 sub start_element {
120 my ($self, $el) = @_;
122 my $atts = $el->{Attributes};
123 while (my ($name, $att_data) = each %$atts) {
124 # Filter out namespace declarations
125 if ($name =~ m|http://www.w3.org/2000/xmlns| ||
126 $name eq '{}xmlns') {
127 delete $atts->{$name};
128 next;
131 # If it's not a namespace declaration, cut the info
132 # down to the value
133 $atts->{$name} = $atts->{$name}->{Value};
136 my $element = [$el->{NamespaceURI}, $el->{LocalName}];
138 # As a special case, if this is the initial stream tag,
139 # it constitutes a message
140 if (same_name(['stream', 'stream'], $element)) {
141 # pretend the children are empty
142 push @{$self->{messages}}, [$element, $atts, []];
143 } else {
144 # Otherwise, we need to remember it
145 if (!exists($self->{current_message})) {
146 $self->{children} = [[]];
147 $self->{current_message} = [$element, $atts,
148 $self->{children}->[0]];
149 } else {
150 my $new_children = [];
151 my $current_children = $self->{children}->[-1];
152 push @{$self->{children}}, $new_children;
153 my $new_element = [$element, $atts,
154 $new_children];
155 push @$current_children, $new_element;
160 sub end_element {
161 my ($self, $el) = @_;
163 if (!scalar@{$self->{children}}) {
164 return; # got a final </stream:stream>
167 # Return to the previous tag's children.
168 pop @{$self->{children}};
170 # If we've received a complete message, remember it
171 # so we can return it.
172 if (!scalar(@{$self->{children}})) {
173 my $message = $self->{current_message};
174 push @{$self->{messages}}, $message;
175 delete $self->{current_message};
179 sub parse {
180 my ($self, $xml) = @_;
182 $self->{parser}->parse_string($xml);
184 my $messages = $self->{messages};
185 $self->{messages} = [];
186 return $messages;
189 sub dump {
190 my $self = shift;
191 if ($self->{_debug}) {
192 print Dumper(@_);
193 print "\n";
197 sub att_name {
198 my $self = shift;
199 my $att_name = shift;
200 my $name = $self->convert_name($att_name);
201 return '{' . $name->[0] . '}' . $name->[1];
204 sub convert_name {
205 my $self = shift;
206 my $name = shift;
208 my ($left, $right) = split /:/, $name, 2;
209 if (defined($right)) {
210 return [$self->namespace_for_prefix($left), $right];
211 } else {
212 return [$self->namespace_for_prefix(''), $left];
216 sub namespace_for_prefix {
217 my $self = shift;
218 my $prefix = shift;
220 my $namespaces = $self->{namespaces}->{$prefix};
221 if (defined($namespaces) && scalar(@$namespaces) > 0) {
222 return $namespaces->[-1];
225 return '';
228 sub characters {
229 my ($self, $el_data) = @_;
231 my $chars = $el_data->{Data};
232 my $current_children = $self->{children}->[-1];
233 if (!defined($current_children)) {
234 # We're still not to the first message yet.
235 return;
238 if (scalar(@$current_children) && ref($current_children->[-1]) eq '') {
239 $current_children->[-1] = $current_children->[-1] . $chars;
240 } else {
241 push @$current_children, $chars;