ICQ process_message must also call escapeHTML.
[thrasher.git] / perl / lib / Thrasher / XML.pm
blobb44b2c4e765d3dc3a18625c29a2aea71496dad98
1 package Thrasher::XML;
2 use strict;
3 use warnings;
5 use base 'Exporter';
7 BEGIN {
8 our @EXPORT_OK = qw(iq_reply error_tag iq_error feature
9 has_subtags no_match strip_resource
10 same_name normalize_name get_attr
11 extract multi_extract save save_match save_sub
12 spec_is_optional take text_extractor
13 extract_disco_items extract_disco_info
14 recursive_extract);
15 our %EXPORT_TAGS = (all => \@EXPORT_OK);
18 use Thrasher::Constants qw(:all);
19 use Thrasher::Log qw(:all);
20 use Carp qw(confess);
21 use Data::Dumper;
23 my %namespaces = (stream => 'http://etherx.jabber.org/streams',
24 '' => 'jabber:client', # 'default" namespace
27 =pod
29 =head1 NAME
31 Thrasher::XML - a collection of functions for manipulating XML
33 =cut
35 =head1 CONVENIENCE FUNCTIONS
37 Several convenience functions for working with namespaced tag names
38 are provided.
40 =over 4
42 =item *
44 C<same_name>($l, $r): Given two names, returns true if they are
45 the same name, false otherwise. This function calls C<normalize_name>
46 on both arguments for you.
48 =cut
50 # Compare two names, as represented in this code, to see if they are
51 # the same. If the first element exists in %namespaces, the full
52 # namespace will be substituted. If the value is a string, it will
53 # be in the default namespace, which for streams is 'jabber:client'.
54 sub same_name {
55 my $left = shift;
56 my $right = shift;
58 $left = normalize_name($left);
59 $right = normalize_name($right);
61 return $left->[0] eq $right->[0] && $left->[1] eq $right->[1];
64 =pod
66 =item *
68 C<normalize_name>($name): Normalizes the given name according to the
69 following rules:
71 =over 4
73 =item *
75 If the name is a "Clark-style" URI+name {uri}name, breaks out
76 the uri and the name separately.
78 =item *
80 If the name is a simple string, the name is returned as being in
81 the default namespace ("jabber:client").
83 =item *
85 If the name is a two-element array, and the namespace has an entry
86 in the internal %namespaces hash, the entry in the hash will
87 replace the namespaces. This is why you can so ofter refer to
88 ['stream', 'stream'] instead of ['http://etherx.jabber.org/streams',
89 'stream'].
91 =item *
93 If the name is a two-element array that didn't meet the prior
94 condition, the array is returned.
96 =item *
98 If the name makes it this far, the function dies with a descriptive
99 error message showing what you passed in.
101 =back
103 You shouldn't really need to call this function, everything in this
104 module tends to call it at every opportunity.
106 Also note that this is only legit because the shortcuts are
107 hard-coded, not pulled from the XML itself. Using actual
108 prefixes in the XML would be wrong.
110 =cut
112 sub normalize_name {
113 my $name = shift;
115 if (!ref($name)) {
116 if ($name =~ /^\{([^}]+)\}(.*)$/) {
117 return [$1, $2];
119 return [$namespaces{''}, $name];
122 if (ref($name) eq 'ARRAY') {
123 my ($namespace, $tag) = @$name;
124 if (exists($namespaces{$namespace})) {
125 $namespace = $namespaces{$namespace};
127 return [$namespace, $tag];
130 confess "Invalid parameter passed to normalize_name:\n"
131 . Dumper($name);
134 =pod
136 =item *
138 C<get_attr>($attr_name, $attrs_hash): Get the value of an attribute
139 from an attribute list, using the correct attribute name. $attr_name
140 is passed through normalize name.
142 =cut
144 sub get_attr {
145 my $name = shift;
146 my $atts = shift;
148 $name = normalize_name($name);
149 $name = '{' . $name->[0] . '}' . $name->[1];
150 return $atts->{$name};
153 =pod
155 C<extract>($xml_pattern, $xml): Extracts values from processed XML
156 into a hash for your consumption, or dies if the xml_pattern doesn't
157 match, using a pattern matching syntax.
159 The XML pattern is a structure that looks like the structure
160 we use for XML, and matches in the following way:
162 =over 4
164 =item * If a field is undef, the field must exist, but may
165 be anything.
167 =item * If the field is a placeholder, the field must exist,
168 but may be anything.
170 =item * If the field is a constant value, it must exactly match
171 the given constant value.
173 =item * If the field is a variable specification, as described below,
174 the field must match the match specification passed in as a parameter
175 to C<extract>.
177 =item * Fields not defined in the match clause at all are ignored.
179 =back
181 As a special case, for children specifications, the match will
182 succeed if I<any> of the children match, and for placeholders
183 with conditions, any matching children will be extracted.
185 If the match is successful, a hashref will be returned with the
186 result of all the placeholders, according to the names given.
187 If you had no placeholders (pure matching), then the hashref
188 will be empty. Otherwise, the placeholders will receive whatever
190 This is a powerful method, but I'm going to wave at the other code
191 and say to look at the examples for usage; if you need more docs,
192 ask for them.
194 As a special exeption, if the $xml_pattern is undef, the function
195 simply returns $xml, implementing a "match everything" clause
196 that is useful for error cases.
198 The variables and placeholders that you can use:
200 =over
202 =item C<save>($name): The basic placeholder that will
203 simply return the contents with no further processing.
205 =item C<save_match>([$name,] $match): Recursively calls
206 match. If this fails (returns undef), the entire match
207 fails; if it succeeds, the values in the matched hash
208 are merged into the final returned hash.
210 If used in a children match, this will return only the
211 first such matching result, useful for extracting subelements.
213 If no name is given, it will only be used to match, it
214 won't save the match specifically. (This can be used to
215 recursively match and save inner elements.)
217 =item C<save_sub>($name, $sub): Calls the sub with
218 the result, returns the result of the I<sub> as the result
219 of the match.
221 If used in the children match spec, where this will collect
222 all sub runs that don't return "undef" into the resulting
223 arrayref, and fail if no children match the spec.
225 =item C<take>($name): This requires that you pass in a
226 value for taking to the "extract" hash, which will be
227 treated exactly as if it were in the original specification,
228 right down to allowing you to return values. This allows
229 us to factor out constant or repeating patterns and avoid
230 constructing and garbage collecting them over and over.
232 =back
234 =cut
236 sub extract {
237 my $xml_pattern = shift;
238 my $xml = shift;
239 my $provided_vars = shift || {};
241 # undefined $xml_pattern matches everything
242 if (!defined($xml_pattern)) {
243 return $xml;
246 if (!ref($xml)) {
247 return undef;
250 if (!defined($xml_pattern)) {
251 return $xml;
254 my $final_hash = {};
256 my ($pat_element, $pat_atts, $pats_children) = @$xml_pattern;
257 my ($element, $atts, $children) = @$xml;
259 my $string_equal = sub { $_[0] eq $_[1] };
261 my $extract_fail_reason = '';
262 my $returned;
264 my $compare;
265 $compare = sub {
266 my $match_spec = shift;
267 my $match_target = shift;
268 my $comparator = shift || $string_equal;
270 if (!defined($match_spec)) {
271 return $match_target;
272 } elsif (ref($match_spec) eq
273 'Thrasher::XMPPStreamIn::Placeholder') {
274 $final_hash->{$match_spec->{name}} = $match_target;
275 return $match_target || $match_spec->{optional};
276 } elsif (ref($match_spec) eq
277 'Thrasher::XMPPStreamIn::Variable') {
278 return $compare->($provided_vars->{$$match_spec},
279 $match_target,
280 $comparator);
281 } elsif(ref($match_spec) eq
282 'Thrasher::XMPPStreamIn::Placeholder::Match') {
283 my $match = $match_spec->{match};
284 my $result = extract($match, $match_target, $provided_vars);
286 if (!defined($result) && !$result->{optional}) {
287 $extract_fail_reason = "Matching XML against a string.";
288 return undef;
289 } else {
290 # failure automatically propogates as a die.
291 while (my ($key, $value) = each %$result) {
292 $final_hash->{$key} = $value;
294 return $match_target;
296 } elsif (ref($match_spec) eq
297 'Thrasher::XMPPStreamIn::Placeholder::Sub') {
298 my $name = $match_spec->{name};
299 my $subref = $match_spec->{subref};
300 my $result = $subref->($match_target);
301 if (!defined($result)) {
302 $extract_fail_reason = 'Subroutine returned an undef.';
303 return undef;
305 if (defined($name)) {
306 $final_hash->{$name} = $result;
308 return $result;
309 } else {
310 return $comparator->($match_spec, $match_target) ?
311 $match_target : undef;
315 # Clean up after the closure, which tends to confuse perl
316 # GC
317 my $cleanup;
318 $cleanup = sub { undef $compare; undef $cleanup; undef $string_equal; };
320 ELEMENT_NAME: {
321 my $element_compare = sub {
322 my ($pat_namespace, $pat_element_name) = @{shift()};
323 my ($namespace, $element_name) = @{shift()};
325 if (defined($pat_namespace) &&
326 $pat_namespace ne $namespace) {
327 $extract_fail_reason =
328 "Namespace mismatch: $pat_namespace ne $namespace";
329 return 0; # fail
332 if (defined($pat_element_name) &&
333 $pat_element_name ne $element_name) {
334 $extract_fail_reason =
335 "Element name mismatch: $pat_element_name ne $element_name";
336 return 0;
339 return 1;
342 $element = normalize_name($element);
343 my $result = $compare->($pat_element, $element,
344 $element_compare);
345 undef $element_compare;
347 # Upgrade undefs to a die
348 if (!$result) {
349 $cleanup->();
350 confess "In element name: $extract_fail_reason";
354 ATTRIBUTES: {
355 if (!defined($pat_atts)) {
356 # deliberately blank; no action
357 } else {
358 while (my ($key, $value) = each %$pat_atts) {
359 # verify existence in the target atts
360 # before passing off to $compare
361 if (substr($key, 0, 1) ne '{') {
362 $key = '{}' . $key;
365 if (!exists($atts->{$key})) {
366 if (spec_is_optional($value)) {
367 next;
369 $cleanup->();
370 confess "Attribute $key unexpectedly didn't exist.";
372 if (!$compare->($value, $atts->{$key})) {
373 $cleanup->();
374 confess "In attribute: $extract_fail_reason";
380 my $match_children1 = sub {
381 my ($pat_children) = @_;
383 my $final_children = [];
384 for my $child (@$children) {
385 my $match_result = eval {
386 $compare->($pat_children,
387 $child);
389 if (!$@ && defined($match_result)) {
390 if (ref($pat_children) eq
391 'Thrasher::XMPPStreamIn::Placeholder::Match' &&
392 $pat_children->{name}) {
393 return $match_result;
395 push @$final_children, $match_result;
399 if (!@$final_children && !spec_is_optional($pat_children)) {
400 $cleanup->();
401 confess "In attempting to match children, none matched.";
403 # Transfer the matching results into the name for the sub if needed
404 if (ref($pat_children) eq 'Thrasher::XMPPStreamIn::Placeholder::Sub') {
405 return $final_children;
407 elsif (ref($pat_children) eq 'Thrasher::XMPPStreamIn::Placeholder') {
408 return $final_children;
410 else {
411 return;
415 CHILDREN: {
416 if (defined($pats_children)) {
417 if (ref($pats_children) ne 'ARRAY') {
418 $pats_children = [ $pats_children ];
420 elsif (scalar(@{$pats_children}) == 0) {
421 # Special case: [] means "no children"
422 if (scalar(@{$children}) != 0) {
423 $cleanup->();
424 confess('Match wanted no children.');
426 last CHILDREN;
428 for my $pat_children (@{$pats_children}) {
429 my $to_save = $match_children1->($pat_children);
430 if ($to_save) {
431 $final_hash->{$pat_children->{name}} = $to_save;
437 $cleanup->();
438 return $final_hash;
441 # Taking in pairs of extract_specifications and sub actions,
442 # try multiple matches at a time, dying only if they all fail.
443 # Put the most likely success stanza up front.
444 sub multi_extract {
445 # assumes none of the matches will change $xml_message
446 my $xml_message = shift;
448 while (@_) {
449 my $match = shift @_;
450 my $action = shift @_;
451 if (!defined($action)) {
452 confess "Match rule given without action for multi_extract.";
454 my $result;
456 local $@;
457 eval {
458 $result = extract($match, $xml_message);
461 if (defined($result)) {
462 return $action->($result);
466 confess("In multi-extract, no matches found. Message: ".
467 Dumper($xml_message));
470 sub save ($;$) {
471 my $name = shift;
472 my $optional = shift;
473 my $self = {name => $name,
474 optional => $optional};
475 return bless $self, 'Thrasher::XMPPStreamIn::Placeholder';
478 sub save_match ($;$$) {
479 my $match;
480 my $name;
481 my $optional;
482 if (@_ >= 2) {
483 $name = shift;
484 $match = shift;
485 $optional = shift;
486 } else {
487 $match = shift;
488 $name = '';
490 my $self = {match => $match, name => $name, optional => $optional};
491 bless $self, 'Thrasher::XMPPStreamIn::Placeholder::Match';
494 sub save_sub ($$;$) {
495 my $name = shift;
496 my $subref = shift;
497 my $optional = shift;
498 my $var_spec = {name => $name, subref => $subref, optional => $optional};
499 bless $var_spec, 'Thrasher::XMPPStreamIn::Placeholder::Sub';
502 sub spec_is_optional {
503 my $spec = shift;
504 return (ref($spec) eq 'Thrasher::XMPPStreamIn::Placeholder::Match'
506 ref($spec) eq 'Thrasher::XMPPStreamIn::Placeholder'
508 ref($spec) eq 'Thrasher::XMPPStreamIn::Placeholder::Sub')
509 && $spec->{optional};
513 sub take ($) {
514 my $name = shift;
515 return bless \$name, 'Thrasher::XMPPStreamIn::Variable';
518 # This does not descend into subtags, by design
519 sub text_extractor {
520 if (!ref($_[0])) { return $_[0] }
521 return undef;
524 =pod
526 Some shortcuts for common specification patterns:
528 =over
530 =back
532 =back
534 =cut
537 # NOTE NOTE NOTE: You pass back the *original* to and from to
538 # this function, with centralizes the reversing of those
539 # two parameters!
540 sub iq_reply {
541 my $self = shift;
542 my $iq_params = shift;
543 my $response = shift;
545 # Reconstruct an IQ response
546 my $iq = [[$NS_COMPONENT, 'iq'],
547 {from => $iq_params->{to},
548 to => $iq_params->{from},
549 id => $iq_params->{id},
550 type => 'result'},
551 ($response ? [$response] : [])];
552 $self->xml_out($iq);
555 sub error_tag {
556 my $error = shift;
558 if (!exists($IQ_ERRORS{$error})) {
559 confess "Undefined error code $error used.";
562 my ($error_code, $error_type, $error_tag) =
563 @{$IQ_ERRORS{$error}};
565 return [[$NS_COMPONENT, 'error'],
566 {code => $error_code,
567 type => $error_type},
568 [[[$NS_ERROR, $error_tag], {}, []]]
572 sub iq_error {
573 my $self = shift;
574 my $iq_params = shift;
575 my $error_id = shift;
577 my $query = $iq_params->{query};
578 my $children = [$query];
580 push @$children, error_tag($error_id);
582 my $iq = [[$NS_COMPONENT, 'iq'],
583 {from => $iq_params->{to},
584 to => $iq_params->{from},
585 id => $iq_params->{id},
586 type => 'error'},
587 $children];
588 $self->xml_out($iq);
591 sub feature { [[$NS_DISCO_INFO, 'feature'], {var => $_[0]}, []] }
593 # Pass the children list;
594 sub has_subtags {
595 my $children = shift;
596 return !!(grep { ref($_) } @$children);
599 # Returns a multi_extract clause that matches everything, and
600 # prints off an useful error logging event. This in effect turns
601 # multi_extract into a function that dies when unexpected stuff
602 # occurs, into printing something out on STDERR.
603 # This requires the children to be
604 sub no_match {
605 my $self = shift;
606 my $message = shift;
607 my $iq_params = shift;
608 my @other_params = @_;
609 return undef, sub {
610 log $message . "\n";
611 log Dumper(\@other_params) if @other_params;
612 $self->iq_error($iq_params, 'bad_request');
616 sub strip_resource {
617 my $jid = shift;
618 if (!defined($jid)) {
619 confess "stripping the resource from an undefined JID";
621 $jid =~ s/\/.*$//;
622 return $jid;
625 # Tear apart a disco result, returning an array of identities
626 # [type, category, name], followed by an array of features. Give this
627 # the perl data corresponding to the <query> tag.
628 sub extract_disco_info {
629 my $query = shift;
631 my $disco = extract([undef, undef,
632 save_sub('disco',
633 sub {( ref($_[0]) &&
634 (($_[0]->[0]->[1] eq 'identity') ||
635 ($_[0]->[0]->[1] eq 'feature'))) ?
636 $_[0] : undef},
637 # Don't require children; empty
638 # results may be e.g. an
639 # incompletely started service.
640 1)],
641 $query);
642 my @identities;
643 my @features;
645 for my $feature_or_identity (@{$disco->{disco}}) {
646 my $atts = $feature_or_identity->[1];
647 if ($feature_or_identity->[0]->[1] eq 'identity') {
648 my $identity = $feature_or_identity;
649 my @identity_data =
650 ($atts->{'{}type'}, $atts->{'{}category'},
651 $atts->{'{}name'});
652 push @identities, \@identity_data;
653 } else {
654 my $feature = $feature_or_identity;
655 push @features, $atts->{'{}var'};
659 return \@identities, \@features;
662 sub extract_disco_items {
663 my $query = shift;
665 my $disco = extract([undef, undef,
666 save_sub('disco',
667 sub {( ref($_[0]) &&
668 (($_[0]->[0]->[1] eq 'item'))) ?
669 $_[0] : undef})],
670 $query);
671 my @items = map { $_->[1]->{'{}jid'} } @{$disco->{disco}};
673 return \@items;
676 # Given a list of &extract specifications, recurse down the
677 # list and return the whole chunk of extract results in one
678 # convenient array. Indicate what XML to recurse on by saving the
679 # target XML in the "rec" field of the extract result.
680 sub recursive_extract {
681 my $xml = shift;
683 my $results = {};
684 my $recursion = $xml;
686 while ($recursion && @_) {
687 my $extract_spec = shift @_;
689 my $extract_results;
691 local $@;
692 eval {
693 $extract_results = extract($extract_spec,
694 $recursion);
697 # If there was an error, return what we got.
698 if ($@) {
699 print "Error: $@\n";
700 return $results;
703 if ($extract_results &&
704 @_ &&
705 !exists($extract_results->{rec})) {
706 warn "Further recursive extractions are specified, "
707 ."but no 'rec' element resulted. Did you forget "
708 ."to specify a 'rec'?";
711 $recursion = $extract_results->{rec};
713 if ($extract_results) {
714 while (my ($key, $val) = each %$extract_results) {
715 $results->{$key} = $val;
720 return $results;