speed up fuzzy accession search
[sgn.git] / lib / CXGN / BreedersToolbox / AccessionsFuzzySearch.pm
blob21dd6c37c81ef049743c39a3c760a5d68899023c
1 package CXGN::BreedersToolbox::AccessionsFuzzySearch;
3 =head1 NAME
5 CXGN::BreedersToolbox::AccessionsFuzzySearch - an object to find approximate matches in the database to a query accession name.
7 =head1 USAGE
9 my $fuzzy_accession_search = CXGN::BreedersToolbox::AccessionsFuzzySearch->new({schema => $schema});
10 my $fuzzy_search_result = $fuzzy_accession_search->get_matches(\@accession_list, $max_distance);
12 =head1 DESCRIPTION
15 =head1 AUTHORS
17 Jeremy D. Edwards (jde22@cornell.edu)
19 =cut
21 use strict;
22 use warnings;
23 use Moose;
24 use MooseX::FollowPBP;
25 use Moose::Util::TypeConstraints;
26 use CXGN::String::FuzzyMatch;
27 use SGN::Model::Cvterm;
28 #use Data::Dumper;
# The DBIx::Class schema this object queries; it must be supplied to new().
# get_matches() reads it back through get_schema().
has 'schema' => (
    required => 1,
    isa      => 'DBIx::Class::Schema',
    is       => 'rw',
);
# get_matches(\@accession_list, $max_distance)
#
# Classifies each queried accession name against the accession uniquenames
# and stock synonyms read from the database.
#
# Parameters:
#   $accession_list_ref - array ref of names to look up (undef is treated
#                         as an empty list)
#   $max_distance       - handed unchanged to
#                         CXGN::String::FuzzyMatch::get_matches
#
# Returns a hash ref with three keys:
#   found  - array ref of { matched_string => $query, unique_name => $name }
#            for queries that are exactly an accession uniquename, or exactly
#            a synonym owned by a single accession (unique_name is then the
#            owning accession's uniquename, not the synonym)
#   fuzzy  - array ref of { name => $query, matches => [ ... ] } where each
#            match is { name, distance, unique_names }; unique_names is
#            always a flat array ref of uniquenames
#   absent - array ref of queries for which the fuzzy search returned nothing
#
# A query that exactly equals a synonym shared by several accessions is
# ambiguous, so it is sent through the fuzzy search and reported under
# 'fuzzy' rather than silently resolved.
sub get_matches {
    my $self               = shift;
    my $accession_list_ref = shift;
    my $max_distance       = shift;

    my $schema         = $self->get_schema();
    my @accession_list = @{ $accession_list_ref || [] };

    my $fuzzy_string_search = CXGN::String::FuzzyMatch->new( { case_insensitive => 0 } );

    my $synonym_type_id   = SGN::Model::Cvterm->get_cvterm_row($schema, 'stock_synonym', 'stock_property')->cvterm_id();
    my $accession_type_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();

    # One row per (accession, stockprop) pair; accessions without any
    # stockprop still appear once because of the LEFT JOIN.
    my $q = "SELECT stock.uniquename, stockprop.value, stockprop.type_id FROM stock LEFT JOIN stockprop USING(stock_id) WHERE stock.type_id=?";
    my $h = $schema->storage->dbh()->prepare($q);
    $h->execute($accession_type_id);

    my %uniquename_hash;              # uniquename => 1
    my %synonym_uniquename_lookup;    # synonym    => [ owning uniquenames ]
    my %seen_synonym_pair;            # guards against repeated synonym rows
    while (my ($uniquename, $synonym, $type_id) = $h->fetchrow_array()) {
        $uniquename_hash{$uniquename} = 1;

        # Only synonym stockprops with an actual value are of interest.
        next unless $type_id && $type_id == $synonym_type_id;
        next unless defined $synonym;

        # A repeated (synonym, uniquename) row must not make the synonym
        # look as if it belonged to more than one accession.
        next if $seen_synonym_pair{$synonym}{$uniquename}++;
        push @{ $synonym_uniquename_lookup{$synonym} }, $uniquename;
    }

    # Candidate strings for the fuzzy search: every uniquename and every
    # synonym, each listed once even when a string is both.
    my %candidate_names = %uniquename_hash;
    $candidate_names{$_} = 1 for keys %synonym_uniquename_lookup;
    my @stock_names = keys %candidate_names;

    my @found_accessions;
    my @fuzzy_accessions;
    my @absent_accessions;

    foreach my $accession_name (@accession_list) {

        # Exact hit on an accession uniquename: no fuzzy search needed.
        if (exists $uniquename_hash{$accession_name}) {
            push @found_accessions, { 'matched_string' => $accession_name, 'unique_name' => $accession_name };
            next;
        }

        # Exact hit on a synonym owned by exactly one accession: report the
        # owning accession's uniquename alongside the searched string.
        my $synonym_owners = $synonym_uniquename_lookup{$accession_name};
        if ($synonym_owners && scalar @{$synonym_owners} == 1) {
            push @found_accessions, { 'matched_string' => $accession_name, 'unique_name' => $synonym_owners->[0] };
            next;
        }

        my @accession_matches = @{ $fuzzy_string_search->get_matches($accession_name, \@stock_names, $max_distance) };
        if (scalar @accession_matches == 0) {
            push @absent_accessions, $accession_name;
            next;
        }

        my @matches;
        foreach my $match (@accession_matches) {
            my $matched_name = $match->{'string'};
            my %match_info = (
                'name'     => $matched_name,
                'distance' => $match->{'distance'},
                # A synonym maps to its owning accessions; any other matched
                # string is itself the uniquename.
                'unique_names' => $synonym_uniquename_lookup{$matched_name} || [$matched_name],
            );
            push @matches, \%match_info;
        }
        push @fuzzy_accessions, { 'name' => $accession_name, 'matches' => \@matches };
    }

    my %results = (
        'found'  => \@found_accessions,
        'fuzzy'  => \@fuzzy_accessions,
        'absent' => \@absent_accessions,
    );
    return \%results;
}