feature: large_microzap
[zfs.git] / scripts / update_authors.pl
blob8dd49b5fb38d631ea5a95eef7be99b9b9600125b
1 #!/usr/bin/env perl
3 # SPDX-License-Identifier: MIT
5 # Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
7 # Permission is hereby granted, free of charge, to any person obtaining a copy
8 # of this software and associated documentation files (the "Software"), to
9 # deal in the Software without restriction, including without limitation the
10 # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
11 # sell copies of the Software, and to permit persons to whom the Software is
12 # furnished to do so, subject to the following conditions:
14 # The above copyright notice and this permission notice shall be included in
15 # all copies or substantial portions of the Software.
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 # IN THE SOFTWARE.
26 # This program will update the AUTHORS file to include commit authors that are
27 # in the git history but are not yet credited.
29 # The CONTRIBUTORS section of the AUTHORS file attempts to be a list of
30 # individual contributors to OpenZFS, with one name, address and line per
31 # person. This is good for readability, but does not really leave room for the
32 # fact that names and emails on commits from the same individual can be
33 # different, for all kinds of reasons, not limited to:
35 # - a person might change organisations, and so their email address changes
37 # - a person might be paid to work on OpenZFS for their employer, and then hack
38 # on personal projects in the evening, so commits legitimately come from
39 # different addresses
41 # - names change for all kinds of reasons
43 # To try and account for this, this program will try to find all the possible
44 # names and emails for a single contributor, and then select the "best" one to
45 # add to the AUTHORS file.
47 # The CONTRIBUTORS section of the AUTHORS file is considered the source of
48 # truth. Once an individual committer is listed in there, that line will not be
49 # removed regardless of what is discovered in the commit history. However, it
50 # can't just be _anything_. The name or email still has to match something seen
51 # in the commit history, so that we're able to understand that it's the same
52 # contributor.
54 # The bulk of the work is in running `git log` to fetch commit author names and
55 # emails. For each value, we generate a "slug" to use as an internal id for
56 # that value, which is mostly just the lowercase of the value with whitespace
57 # and punctuation removed. Two values with subtle differences can produce the
58 # same slug, so at this point we also try to keep the "best" pre-slug value as
59 # the display version. We use this slug to update two maps, one of email->name,
60 # the other of name->email.
62 # Once collected, we then walk all the emails we've seen and get all the names
63 # associated with every instance. Then for each of those names, we get all the
64 # emails associated, and so on until we've seen all the connected names and
65 # emails. This collection is every possible name and email for an individual
66 # contributor.
68 # Finally, we consider these groups, and select the "best" name and email for
69 # the contributor, and add them to the author tables if they aren't there
70 # already. Once we've done everyone, we write out a new AUTHORS file, and
71 # that's the whole job.
73 # This is imperfect! It's necessary for the user to examine the diff and make
74 # sure it's sensible. If it hasn't hooked up right, it may be necessary to
75 # adjust the input data (via .mailmap) or improve the heuristics in this
76 # program. It took a long time to get into good shape when first written (355
77 # new names added to AUTHORS!) but hopefully in the future we'll be running
78 # this regularly so it doesn't fall so far behind.
81 use 5.010;
82 use warnings;
83 use strict;
# "Best looking" display form of every name and email, keyed on slug.
my %display_name;
my %display_email;

# Read the existing AUTHORS file. Every line up to and including the
# CONTRIBUTORS: line is kept verbatim so it can be written back out to the
# new file. Each later line of the form "<whitespace>Name <email>" is
# recorded in a pair of slug-keyed tables: email slug -> name slug, and
# name slug -> email slug.
my %authors_name;
my %authors_email;

my @authors_header;

# True until the CONTRIBUTORS: line has been consumed.
my $reading_header = 1;

for my $line (do { local (@ARGV) = ('AUTHORS'); <> }) {
    chomp $line;

    if ($reading_header) {
        push @authors_header, $line;
        $reading_header = 0 if $line =~ m/^CONTRIBUTORS:/;
        next;
    }

    # Contributor lines are indented; anything that doesn't parse is skipped.
    my ($name, $email) = $line =~ m/^\s+(.+)(?= <) <([^>]+)/;
    next unless $name;

    my $sname = name_slug($name);
    my $semail = email_slug($email);

    $authors_name{$semail} = $sname;
    $authors_email{$sname} = $semail;

    # Whatever AUTHORS already says is, by definition, the "best looking"
    # form, so it is stored directly as the display value.
    $display_name{$sname} = $name;
    $display_email{$semail} = $email;
}
# Next, we load all the commit authors, and form name<->email mappings, keyed
# on slug. The %aN and %aE placeholders ask git for the .mailmap-converted
# name and email, which lets us control the input to some extent by making
# changes there. The log lines are processed in the reverse of the order git
# prints them.
my %git_names;
my %git_emails;

for my $line (reverse qx(git log --pretty=tformat:'%aN:::%aE')) {
    chomp $line;

    my ($name, $email) = $line =~ m/^(.*):::(.*)/;
    next unless $name && $email;

    my $semail = email_slug($email);
    my $sname = name_slug($name);

    $git_names{$semail}{$sname} = 1;
    $git_emails{$sname}{$semail} = 1;

    # Only refresh a "best looking" display value when AUTHORS has nothing
    # for this slug; values that came from AUTHORS must not change.
    update_display_email($email) unless $authors_name{$semail};
    update_display_name($name) unless $authors_email{$sname};
}
# Group everything seen in git into unique committers. Starting from one
# email we pull in all of its names, then all the emails of those names, and
# keep alternating until both work queues are empty.
my @committers;
for my $seed_email (sort keys %git_names) {
    # An earlier group may already have consumed (deleted) this email.
    next unless $git_names{$seed_email};

    my (%seen_emails, %seen_names);
    my @email_queue = ($seed_email);
    my @name_queue;

    while (@email_queue || @name_queue) {
        while (my $email = shift @email_queue) {
            next if $seen_emails{$email}++;

            # Taking the entry out of the map as we go means no later
            # seed can start from it again.
            push @name_queue, sort keys %{ delete $git_names{$email} };
        }

        while (my $name = shift @name_queue) {
            next if $seen_names{$name}++;
            push @email_queue, sort keys %{ delete $git_emails{$name} };
        }
    }

    # A "committer" is the pair [sorted email slugs, sorted name slugs].
    push @committers, [[sort keys %seen_emails], [sort keys %seen_names]];
}
# With the committers grouped, work out which ones need adding to AUTHORS.
for my $committer (@committers) {
    my ($emails, $names) = @$committer;

    # A committer with any email or any name already in AUTHORS is left
    # untouched.
    next if (grep { $authors_name{$_} } @$emails)
        || (grep { $authors_email{$_} } @$names);

    # Pick the "best" email and name (both are slugs) and record the pair.
    my $chosen_email = best_email(@$emails);
    my $chosen_name = best_name(@$names);

    $authors_email{$chosen_name} = $chosen_email;
    $authors_name{$chosen_email} = $chosen_name;
}
# Now output the new AUTHORS file: the saved header lines first, then one
# "name <email>" line per author, ordered by name slug and shown in display
# form.
open my $fh, '>', 'AUTHORS' or die "E: couldn't open AUTHORS for write: $!\n";
#my $fh = \*STDOUT;
say $fh join("\n", @authors_header, "");
for my $name (sort keys %authors_email) {
    my $cname = $display_name{$name};
    my $cemail = $display_email{email_slug($authors_email{$name})};
    say $fh " $cname <$cemail>";
}

# Output is buffered, so a failed write may only be reported when the handle
# is closed. AUTHORS was truncated by the open above, so a failure here must
# not pass silently.
close $fh or die "E: couldn't finish writing AUTHORS: $!\n";

exit 0;
207 # "Slugs" are used as the hashtable key for names and emails. They are used to
208 # make two variants of a value be the "same" for matching. Mostly this is
209 # to make upper and lower-case versions of a name or email compare the same,
210 # but we do a little bit of munging to handle some common cases.
212 # Note that these are only used for matching internally; for display, the
213 # slug will be used to look up the display form.
# Build the matching key ("slug") for a name: lowercased, with all
# whitespace and dots removed so that differences in spacing and initials
# do not matter.
sub name_slug {
    my ($name) = @_;

    (my $slug = lc $name) =~ s/[\s\.]//g;

    return $slug;
}
# Build the matching key ("slug") for an email address.
sub email_slug {
    my $slug = shift;

    # Delete a leading stretch that ends in whitespace (the match is
    # greedy, so it runs through the last whitespace in the value) and a
    # trailing stretch that starts with whitespace.
    # NOTE(review): a value with trailing whitespace is consumed entirely
    # by the leading match and yields an empty slug - confirm that callers
    # never pass one.
    $slug =~ s/^(.*\s+)|(\s+.*)$//g;

    # The "userid+" prefix on Github noreply addresses is optional, so it
    # is dropped to make both forms compare the same. The check runs before
    # lowercasing and is case-sensitive.
    if ($slug =~ m/\.noreply\.github\.com$/) {
        $slug =~ s/^[^\+]*\+//g;
    }

    return lc $slug;
}
# Offer $name as a candidate display form for its slug in %display_name.
#
# The candidate is stored when the slug has no display form yet, or when
# the candidate contains fewer lowercase ASCII letters and spaces than the
# stored one. The guess is that someone who put effort into capitals or
# other non-lowercase characters cares more about that form. If the guess
# is wrong, a .mailmap entry is the better fix.
sub update_display_name {
    my ($name) = @_;
    my $key = name_slug($name);

    my $current = $display_name{$key};
    if ($current) {
        # tr/// with an empty replacement list only counts; nothing is
        # modified.
        my $candidate_plain = ($name =~ tr/a-z //);
        my $current_plain = ($current =~ tr/a-z //);
        return unless $candidate_plain < $current_plain;
    }

    $display_name{$key} = $name;
}
# Offer $email as a candidate display form for its slug in %display_email.
#
# As with names, the candidate is stored when nothing is stored yet, or
# when it contains fewer lowercase ASCII letters and spaces than the stored
# form.
sub update_display_email {
    my ($email) = @_;

    # The slug is taken from the address exactly as it was passed in.
    my $key = email_slug($email);

    # For display, drop the leading "plus address" part of Github noreply
    # addresses.
    if ($email =~ m/\.noreply\.github\.com$/) {
        $email =~ s/^[^\+]*\+//g;
    }

    my $current = $display_email{$key};
    if ($current) {
        my $candidate_plain = ($email =~ tr/a-z //);
        my $current_plain = ($current =~ tr/a-z //);
        return unless $candidate_plain < $current_plain;
    }

    $display_email{$key} = $email;
}
# Pick the "best" name for a committer from a list of name slugs.
#
# Returns the slug whose display form sorts first in a plain string
# comparison. "Best" is very subjective and this simple ordering gave
# good-enough results. Accents, punctuation and capitals, or the most
# recently seen name, might be better signals, but .mailmap is really the
# place to control the outcome.
sub best_name {
    my @ranked = sort { $display_name{$a} cmp $display_name{$b} } @_;

    return shift @ranked;
}
# Pick the "best" email for a committer from a list of email slugs.
#
# The candidates are ordered by the following criteria, each one only
# consulted when the earlier ones tie, and the first candidate is returned:
#   1. addresses with exactly one '@' come first
#   2. internal/local addresses come last
#   3. Github noreply aliases come last
#   4. freemail (gmail/hotmail) addresses come last
#   5. alphabetical by domain
#   6. alphabetical by local part
# Returns undef when called with no candidates.
sub best_email {
    state $internal_re = qr/\.(?:internal|local|\(none\))$/;
    state $noreply_re = qr/\.noreply\.github\.com$/;
    state $freemail_re = qr/\@(?:gmail|hotmail)\.com$/;

    my @emails = sort {
        my $cmp;

        # prefer address with a single @ over those without
        $cmp = (($b =~ tr/@//) == 1) <=> (($a =~ tr/@//) == 1);
        return $cmp unless $cmp == 0;

        # prefer any address over internal/local addresses
        $cmp = (($a =~ $internal_re) <=> ($b =~ $internal_re));
        return $cmp unless $cmp == 0;

        # prefer any address over github noreply aliases
        $cmp = (($a =~ $noreply_re) <=> ($b =~ $noreply_re));
        return $cmp unless $cmp == 0;

        # prefer any address over freemail providers
        $cmp = (($a =~ $freemail_re) <=> ($b =~ $freemail_re));
        return $cmp unless $cmp == 0;

        # Split into local part and domain. A value with no '@' has no
        # domain, and an empty value has neither, so missing parts are
        # treated as empty strings to keep the comparisons below free of
        # "uninitialized value" warnings.
        my ($alocal, $adom) = split /\@/, $a;
        my ($blocal, $bdom) = split /\@/, $b;
        $_ //= '' for ($alocal, $adom, $blocal, $bdom);

        # alphabetical by domain
        $cmp = ($adom cmp $bdom);
        return $cmp unless $cmp == 0;

        # alphabetical by local part
        return ($alocal cmp $blocal);
    } @_;

    return shift @emails;
}