fixed get stocks functions
[sgn.git] / lib / CXGN / BreedersToolbox / StocksFuzzySearch.pm
blobda268b11a88740dfa069c6c376a127c27453f47f
1 package CXGN::BreedersToolbox::StocksFuzzySearch;
3 =head1 NAME
5 CXGN::BreedersToolbox::StocksFuzzySearch - an object to find approximate matches in the database to a query list of stock names.
7 =head1 USAGE
9 my $fuzzy_stock_search = CXGN::BreedersToolbox::StocksFuzzySearch->new({schema => $schema});
10 my $fuzzy_search_result = $fuzzy_stock_search->get_matches(\@stock_list, $max_distance, $stock_type);
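=head1 DESCRIPTION

get_matches() takes a list of stock names and looks each one up among the stocks of the given stock type. A name that exactly equals a stock uniquename or a stock synonym is reported under C<found>. Any other name is compared case-insensitively against all uniquenames and synonyms with CXGN::String::FuzzyMatch; names with at least one match within the maximum distance are reported under C<fuzzy>, and the rest under C<absent>. An C<error> entry is added when a synonym is attached to more than one uniquename.
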
15 =head1 AUTHORS
17 Jeremy D. Edwards (jde22@cornell.edu)
19 =cut
21 use strict;
22 use warnings;
23 use Moose;
24 use MooseX::FollowPBP;
25 use Moose::Util::TypeConstraints;
26 use CXGN::String::FuzzyMatch;
27 use SGN::Model::Cvterm;
28 use Data::Dumper;
# Database schema object used for every query made by this class.
# The file loads MooseX::FollowPBP and get_matches() reads this attribute
# through get_schema().
has 'schema' => (
    required => 1,
    isa      => 'DBIx::Class::Schema',
    is       => 'rw',
);
# Classify each queried stock name as an exact match, a fuzzy match, or absent.
#
# Arguments:
#   $stock_list_ref - arrayref of stock names to look up (undef is treated as an empty list)
#   $max_distance   - maximum distance, passed through to CXGN::String::FuzzyMatch->get_matches
#   $stock_type     - name of a cvterm in the 'stock_type' cv; only stocks of this type are searched
#
# Returns a hashref with the keys:
#   found  - arrayref of { matched_string, unique_name [, is_synonym] } for names that exactly
#            equal a uniquename or a synonym
#   fuzzy  - arrayref of { name, matches => [ { name, distance, unique_names
#            [, is_synonym, synonym_of] }, ... ] } for names with approximate matches
#   absent - arrayref of queried names with neither an exact nor an approximate match
#   error  - present only when a synonym is attached to more than one uniquename
#
# Dies when the synonym cvterm or the requested stock type cvterm cannot be found.
sub get_matches {
    my $self = shift;
    my $stock_list_ref = shift;
    my $max_distance = shift;
    my $stock_type = shift;
    my $schema = $self->get_schema();
    my @stock_list = @{ $stock_list_ref || [] };
    my $fuzzy_string_search = CXGN::String::FuzzyMatch->new( { case_insensitive => 0 } );
    my @fuzzy_stocks;
    my @absent_stocks;
    my @found_stocks;
    my @errors;
    my %results;
    print STDERR "FuzzySearch 1".localtime()."\n";

    # Resolve the cvterms up front so an unknown stock type fails with a clear message
    # instead of a method call on an undefined value.
    my $synonym_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'stock_synonym', 'stock_property');
    die "Could not find the cvterm 'stock_synonym' in cv 'stock_property'\n" unless $synonym_cvterm;
    my $stock_type_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, $stock_type, 'stock_type');
    die "Could not find the stock type '".(defined $stock_type ? $stock_type : '')."' in cv 'stock_type'\n" unless $stock_type_cvterm;
    my $synonym_type_id = $synonym_cvterm->cvterm_id();
    my $stock_type_id = $stock_type_cvterm->cvterm_id();

    my $q = "SELECT stock.uniquename, stockprop.value, stockprop.type_id FROM stock LEFT JOIN stockprop USING(stock_id) WHERE stock.type_id=?";
    my $h = $schema->storage->dbh()->prepare($q);
    $h->execute($stock_type_id);

    my %uniquename_hash;
    my %synonym_uniquename_lookup;    # synonym => arrayref of distinct uniquenames
    my %seen_synonym_pair;
    while (my ($uniquename, $synonym, $type_id) = $h->fetchrow_array()) {
        $uniquename_hash{$uniquename}++;
        next unless $type_id && $type_id == $synonym_type_id;
        next unless defined $synonym;
        # The same synonym value can be returned more than once for one stock; record the
        # pair only once so it is not mistaken for a synonym shared by several stocks.
        next if $seen_synonym_pair{$synonym}{$uniquename}++;
        push @{ $synonym_uniquename_lookup{$synonym} }, $uniquename;
    }

    # Map every lowercased name back to ALL original spellings, so names that differ only
    # by case are each reported and the fuzzy search list contains no duplicates.
    my %originals_for_lowercase;
    my %seen_name;
    foreach my $name (keys %uniquename_hash, keys %synonym_uniquename_lookup) {
        next if $seen_name{$name}++;
        push @{ $originals_for_lowercase{lc($name)} }, $name;
    }
    my @lowercased_names = keys %originals_for_lowercase;

    print STDERR "FuzzySearch 2".localtime()."\n";

    foreach my $stock_name (@stock_list) {

        # Exact uniquename match.
        if (exists($uniquename_hash{$stock_name})) {
            push @found_stocks, { matched_string => $stock_name, unique_name => $stock_name };
            next;
        }

        # Exact synonym match.
        if (exists($synonym_uniquename_lookup{$stock_name})) {
            my @uniquenames = @{ $synonym_uniquename_lookup{$stock_name} };
            if (scalar(@uniquenames) > 1) {
                my $synonym_lookup_uniquename = join ',', @uniquenames;
                push @errors, "This synonym $stock_name has more than one uniquename $synonym_lookup_uniquename. This should not happen!";
                next;
            }
            push @found_stocks, {
                matched_string => $stock_name." (SYNONYM OF ".$uniquenames[0].")",
                is_synonym     => 1,
                unique_name    => $uniquenames[0],
            };
            next;
        }

        # If there is a difference in length greater than 10, it will not fuzzy search over that name.
        my $query_length = length($stock_name);
        my @search_stock_names = grep { abs(length($_) - $query_length) <= 10 } @lowercased_names;

        my @stock_matches = @{ $fuzzy_string_search->get_matches(lc($stock_name), \@search_stock_names, $max_distance) };

        if (scalar(@stock_matches) == 0) {
            push @absent_stocks, $stock_name;
            next;
        }

        my @matches;
        foreach my $stock_match (@stock_matches) {
            my $matched_name = $stock_match->{string};
            # Sorted so that case variants of one lowercased name come out in a stable order.
            foreach my $original_matched_name (sort @{ $originals_for_lowercase{$matched_name} || [] }) {
                my $synonym_lookup_of_matched_string = $synonym_uniquename_lookup{$original_matched_name} || [];
                if (scalar(@$synonym_lookup_of_matched_string) > 1) {
                    my $synonym_lookup_uniquename = join ',', @$synonym_lookup_of_matched_string;
                    push @errors, "This synonym $original_matched_name has more than one uniquename $synonym_lookup_uniquename. This should not happen!";
                    next;
                }
                my %match_info = (
                    name         => $original_matched_name,
                    distance     => $stock_match->{distance},
                    unique_names => [$original_matched_name],
                );
                if (scalar(@$synonym_lookup_of_matched_string) == 1) {
                    $match_info{'is_synonym'} = 1;
                    $match_info{'synonym_of'} = $synonym_lookup_of_matched_string->[0];
                }
                push @matches, \%match_info;
            }
        }
        push @fuzzy_stocks, {
            name    => $stock_name,
            matches => \@matches,
        };
    }

    if (@errors) {
        my $error = join ' ', @errors;
        print STDERR "FUZZY ERRORS: $error\n";
        $results{'error'} = $error;
    }

    $results{'found'} = \@found_stocks;
    $results{'fuzzy'} = \@fuzzy_stocks;
    $results{'absent'} = \@absent_stocks;
    return \%results;
}