lib/CXGN/Cluster/ClusterSet.pm

   1
   2
   3
   4 =head1 NAME CXGN::Cluster::ClusterSet
   5
   6 CXGN::Cluster::ClusterSet - a package to manage sets of preclusters
   7
   8 =head1 DESCRIPTION
   9
  10 This package keeps track of all the sequence ids that are being clustered, and builds hashes for fast access to determine if a sequence is already in a cluster. It has a function, add_match, that takes two ids and adds them to existing clusters or creates a new cluster if applicable.
  11
  12 =head1 AUTHOR
  13
  14 Lukas Mueller <lam87@cornell.edu>
  15
  16 =cut
  17
  18 use strict;
  19
  20 package CXGN::Cluster::ClusterSet;
  21
  22 use base qw( CXGN::Cluster::Object );
  23
  24 =head2 new()
  25
  26  Usage:        my $cluster_set = CGN::Cluster::ClusterSet->new()
  27  Desc:         Constructor
  28  Ret:          a cluster set object
  29  Args:         none
  30  Side Effects: none
  31
  32 =cut
  33
  34 sub new {
  35     my $class = shift;
  36     my $self = $class -> SUPER::new(@_);
  37     $self->reset_unique_key();
  38     keys(%{$self->{key_hash}})=100000;
  39     return $self;
  40 }
  41
  42 =head2 add_match()
  43
  44  Usage:        $cluster_set -> add_match($query_id, $subject_id)
  45  Desc:         this function checks if the query and subject ids are
  46                already in clusters; if they are in the same cluster
  47                the function does nothing, if only subject id or query
  48                id are in a cluster, the other is added to that same
  49                cluster, and if both are in different clusters, the two
  50                clusters are pulled together.
  51  Ret:
  52  Args:         two ids representing the match.
  53  Side Effects:
  54  Example:
  55
  56 =cut
  57
  58
  59 sub add_match {
  60     my $self = shift;
  61     my $query_id = shift;
  62     my $subject_id = shift;
  63 #    $self->set_debug(1);
  64     if (!$query_id || !$subject_id) {
  65         $self->debug("Ignoring $query_id and $subject_id - incomplete match.\n");
  66         return;
  67     }
  68
  69     my $c1 = $self->get_cluster($query_id);
  70     my $c2 = $self->get_cluster($subject_id);
  71     if ( ($c1 && $c2) && ($c1 == $c2) ) {
  72         $self->debug(" [ ignoring both in ".$c1->get_unique_key()."]\n");
  73         # do nothing, because both have already been added
  74         # to the same precluster
  75     }
  76     elsif ( ($c1 && $c2) && ($c1 != $c2) ) {
  77         $self->debug("IDs already in distinct clusters. Combining...\n");
  78         # we have a problem because the two have
  79         # already been assigned to distinct sub-clusters.
  80         # we need to pull the clusters together.
  81         $self->debug("Before combining: "
  82                      .$c1->get_unique_key().":".$c1->get_size().
  83                      " ".$c2->get_unique_key().":".$c2->get_size()."\n");
  84         $c1->combine($c2);
  85         $self->debug("After combining: "
  86                      .$c1->get_unique_key().":".$c1->get_size()."\n");
  87     }
  88     elsif ($c1 && !$c2) {
  89         $self->debug("query $query_id already in cluster [".$c1->get_unique_key()."], adding $subject_id\n");
  90         $c1->add_member($subject_id);
  91         $self->debug("Now containing ".$c1->get_size()." members.\n");
  92     }
  93     elsif (!$c1 && $c2) {
  94         $self->debug("subject $subject_id already in cluster [".$c2->get_unique_key()."], adding $query_id\n");
  95         $c2->add_member($query_id);
  96     }
  97     else {
  98         $self->debug("creating new cluster...\n");
  99         # there is no cluster yet...
 100         # generate a new cluster
 101         my $new = CXGN::Cluster::Precluster->new($self);
 102         $self->add_cluster($new);
 103         $new->add_member($query_id);
 104         $new->add_member($subject_id);
 105         $new->debug($self->get_debug());
 106     }
 107 }
 108
 109 =head2 add_cluster()
 110
 111  Usage:        $cluster_set->add_cluster($c)
 112  Desc:         adds a new cluster, $c, to the cluster set.
 113                The cluster will be tracked through the
 114                internal cluster key hash for fast access.
 115  Ret:          nothing meaningful
 116  Args:         a CXGN::Cluster::Precluster to add to this set
 117
 118 =cut
 119
 120 sub add_cluster {
 121     my $self = shift;
 122     my $cluster = shift;
 123 #    my $unique_key = $cluster->get_unique_key();
 124     $self->add_key_hash($cluster);
 125 }
 126
 127 sub add_key_hash {
 128     my $self = shift;
 129     my $cluster = shift;
 130     my $unique_key = $cluster->get_unique_key();
 131     $self->{key_hash}{$unique_key}=$cluster;
 132 }
 133
 134 =head2 remove_cluster()
 135
 136  Usage: $set->remove_cluster($c);
 137  Desc:  remove the cluster from this set
 138  Ret:   nothing meaningful
 139  Args:  CXGN::Cluster::Precluster to remove
 140
 141 =cut
 142
 143 sub remove_cluster {
 144     my $self = shift;
 145     my $cluster = shift;
 146     $self->debug("Deleting cluster ".$cluster->get_unique_key()."...\n");
 147     delete($self->{key_hash}{$cluster->get_unique_key()});
 148 }
 149
 150 =head2 get_clusters()
 151
 152  Usage: $set->get_clusters
 153  Desc: gets all clusters as list of CXGN::Cluster::Precluster
 154        objects
 155  Ret:  list of CXGN::Cluster::Precluster objects, in ascending
 156        order of number of members
 157  Args: nothing
 158
 159 =cut
 160
 161 sub get_clusters {
 162     my $self = shift;
 163
 164     return sort {$a->get_member_count <=> $b->get_member_count}
 165            values %{$self->{key_hash}};
 166 }
 167
 168 =head2 add_id()
 169
 170  Usage:
 171  Desc:         add a seq id to the cluster hash for fast
 172                cluster retrieval using a seq id.
 173  Ret:
 174  Args:
 175  Side Effects:
 176  Example:
 177
 178 =cut
 179
 180 sub add_id {
 181     my $self = shift;
 182     my $cluster = shift;
 183     my $id = shift;
 184     if (!$id || !$cluster) { die "need cluster object and id"; }
 185     $self->{id_map}{$id}=$cluster;
 186 }
 187
 188 =head2 get_cluster()
 189
 190  Usage:
 191  Desc:         gets the cluster that contains the sequence with id $id.
 192                see also add_id().
 193  Ret:
 194  Args:
 195  Side Effects:
 196  Example:
 197
 198 =cut
 199
 200 sub get_cluster {
 201     my $self = shift;
 202     my $id = shift;
 203     if (!$id) { die "get_cluster: need an id!\n"; }
 204     return $self->{id_map}{$id};
 205 }
 206
 207 =head2 generate_unique_key()
 208
 209  Usage:        my $key = $cluster_set->generate_unique_key()
 210  Desc:         returns a new unique key for a cluster. Unique
 211                keys are simply generated by added 1 to the previous
 212                key.
 213  Ret:
 214  Args:
 215  Side Effects:
 216  Example:
 217
 218 =cut
 219
 220 sub generate_unique_key {
 221     my $self=shift;
 222     return ($self->{unique_key})++;
 223 }
 224
 225 =head2 reset_unique_key()
 226
 227  Usage:        $cluster_set->reset_unique_key()
 228  Desc:         resets the unique key to 0. Should not
 229                be called during normal operation, because
 230                keys will not be unique anymore.
 231  Ret:
 232  Args:
 233  Side Effects:
 234  Example:
 235
 236 =cut
 237
 238 sub reset_unique_key {
 239     my $self=shift;
 240     $self->{unique_key}=0;
 241 }
 242
 243 return 1;