package CXGN::BreedersToolbox::AccessionsFuzzySearch;

=head1 NAME

CXGN::BreedersToolbox::AccessionsFuzzySearch - an object to find approximate matches in the database to a query accession name.

=head1 USAGE

 my $fuzzy_accession_search = CXGN::BreedersToolbox::AccessionsFuzzySearch->new({schema => $schema});
 my $fuzzy_search_result = $fuzzy_accession_search->get_matches(\@accession_list, $max_distance);

=head1 DESCRIPTION

Compares a list of query accession names against the unique names and
synonyms of all accession stocks in the database, and sorts every query name
into one of three lists:

=over 4

=item found

The query name is identical to a stored unique name or synonym. Each entry is
a hash ref with the keys C<matched_string> and C<unique_name>.

=item fuzzy

The query name has no exact counterpart, but the fuzzy matcher returned one
or more candidates. Each entry is a hash ref with the keys C<name> (the query
name) and C<matches>, an array ref of hash refs with the keys C<name>,
C<distance> and C<unique_names> (array ref of unique names the candidate
refers to).

=item absent

The fuzzy matcher returned no candidate for the query name. Each entry is the
query name itself.

=back

=head1 AUTHORS

 Jeremy D. Edwards (jde22@cornell.edu)

=cut

use strict;
use warnings;

use Moose;
use MooseX::FollowPBP;
use Moose::Util::TypeConstraints;
use CXGN::String::FuzzyMatch;
use SGN::Model::Cvterm;

# Database schema used for all lookups. MooseX::FollowPBP generates the
# get_schema() accessor that get_matches() relies on.
# NOTE(review): only the 'isa' constraint of this attribute was visible when
# this file was repaired; 'rw' is used because it provides get_schema() (and
# set_schema()) -- confirm against the original declaration.
has 'schema' => (
    is  => 'rw',
    isa => 'DBIx::Class::Schema',
);

=head2 get_matches

 Usage:        my $result = $search->get_matches(\@accession_list, $max_distance);
 Desc:         classifies each query accession name as found, fuzzy or absent
 Args:         $accession_list_ref - array ref of accession names to look up
               $max_distance       - passed through unchanged to
                                     CXGN::String::FuzzyMatch->get_matches()
 Ret:          hash ref with the keys 'found', 'fuzzy' and 'absent', each an
               array ref (see DESCRIPTION for the entry layout)
 Side Effects: runs one SELECT over stock/stockprop

=cut

sub get_matches {
    my $self               = shift;
    my $accession_list_ref = shift;
    my $max_distance       = shift;

    my $schema = $self->get_schema();

    # An undefined list is treated as "nothing to look up".
    my @accession_list = @{ $accession_list_ref // [] };

    my $fuzzy_string_search = CXGN::String::FuzzyMatch->new( { case_insensitive => 0 } );

    my @fuzzy_accessions;
    my @absent_accessions;
    my @found_accessions;
    my %results;

    my $synonym_type_id   = SGN::Model::Cvterm->get_cvterm_row($schema, 'stock_synonym', 'stock_property')->cvterm_id();
    my $accession_type_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();

    # One row per (accession, stockprop) pair. The LEFT JOIN keeps accessions
    # that have no stockprop at all, in which case value and type_id are NULL.
    my $q = "SELECT stock.uniquename, stockprop.value, stockprop.type_id FROM stock LEFT JOIN stockprop USING(stock_id) WHERE stock.type_id=?";
    my $h = $schema->storage->dbh()->prepare($q);
    $h->execute($accession_type_id);

    my %uniquename_hash;              # unique name => 1
    my %synonym_uniquename_lookup;    # synonym     => [ unique names carrying it ]
    while (my ($uniquename, $synonym, $type_id) = $h->fetchrow_array()) {
        $uniquename_hash{$uniquename} = 1;

        # Only stockprops of the synonym type are synonyms; type_id is undef
        # for accessions without any stockprop.
        if (defined $type_id && $type_id == $synonym_type_id) {
            push @{ $synonym_uniquename_lookup{$synonym} }, $uniquename;
        }
    }

    # The fuzzy matcher searches unique names and synonyms alike.
    my @stock_names   = keys %uniquename_hash;
    my @synonym_names = keys %synonym_uniquename_lookup;
    push @stock_names, @synonym_names;
    my %stock_names_hash = map { $_ => 1 } @stock_names;

    ACCESSION:
    foreach my $accession_name (@accession_list) {

        # Fast path: the query is literally a stored unique name or synonym.
        # NOTE(review): when the query is a synonym, 'unique_name' is set to
        # the synonym itself here rather than to the stock's unique name, and
        # this shortcut makes the exact-match branch further down effectively
        # unreachable. Kept as is because callers may depend on it -- confirm
        # the intended behavior.
        if (exists $stock_names_hash{$accession_name}) {
            push @found_accessions, { "matched_string" => $accession_name, "unique_name" => $accession_name };
            next ACCESSION;
        }

        my @accession_matches = @{ $fuzzy_string_search->get_matches($accession_name, \@stock_names, $max_distance) // [] };

        if (!@accession_matches) {
            push @absent_accessions, $accession_name;
            next ACCESSION;
        }

        my $more_than_one_perfect_match           = 0;
        my $more_than_one_unique_name_for_synonym = 0;
        my $has_one_unique_match                  = 0;

        # The first match is taken as the best candidate.
        # NOTE(review): presumably matches are ordered by distance -- verify
        # against CXGN::String::FuzzyMatch.
        my $matched_string                   = $accession_matches[0]->{'string'};
        my $synonym_lookup_of_matched_string = $synonym_uniquename_lookup{$matched_string};

        # Make sure that there isn't more than one perfect match.
        if ($accession_matches[1]) {
            my $next_matched_string = $accession_matches[1]->{'string'};
            if ($next_matched_string eq $accession_name) {
                $more_than_one_perfect_match = 1;
            }
        }

        # Make sure that there isn't more than one unique name for the
        # searched string if it is a synonym.
        if ($synonym_lookup_of_matched_string) {
            if (scalar @{$synonym_lookup_of_matched_string} > 1) {
                $more_than_one_unique_name_for_synonym = 1;
            }
        }

        # Store accession name to found list if there is one unique match.
        if ($matched_string eq $accession_name && !$more_than_one_perfect_match && !$more_than_one_unique_name_for_synonym) {
            my %found_accession_and_uniquename;
            $found_accession_and_uniquename{'matched_string'} = $accession_name;

            # When there is a synonym, store the unique name and the searched
            # string.
            if ($synonym_lookup_of_matched_string) {
                # Exactly one unique name here: the array length was checked
                # above.
                $found_accession_and_uniquename{'unique_name'} = $synonym_lookup_of_matched_string->[0];
            }
            else {
                $found_accession_and_uniquename{'unique_name'} = $accession_name;
            }
            push @found_accessions, \%found_accession_and_uniquename;
            $has_one_unique_match = 1;
        }

        if (!$has_one_unique_match) {
            my @matches;
            foreach my $match (@accession_matches) {
                my $matched_name = $match->{'string'};
                my $distance     = $match->{'distance'};

                my %match_info;
                $match_info{'name'}     = $matched_name;
                $match_info{'distance'} = $distance;
                if ($synonym_uniquename_lookup{$matched_name}) {
                    $match_info{'unique_names'} = $synonym_uniquename_lookup{$matched_name};
                }
                else {
                    # A flat array ref, the same shape as the synonym branch
                    # (previously an array ref wrapped in another array).
                    $match_info{'unique_names'} = [$matched_name];
                }
                push @matches, \%match_info;
            }

            my %accession_and_fuzzy_matches;
            $accession_and_fuzzy_matches{'name'}    = $accession_name;
            $accession_and_fuzzy_matches{'matches'} = \@matches;
            push @fuzzy_accessions, \%accession_and_fuzzy_matches;
        }
    }

    $results{'found'}  = \@found_accessions;
    $results{'fuzzy'}  = \@fuzzy_accessions;
    $results{'absent'} = \@absent_accessions;
    return \%results;
}

1;