3 # This script takes as input the output of a command like
5 # find /data -type f -exec md5sum {} \;
7 # sorted by checksum and saved as md5sorted.txt, and writes script.sh, a shell script that removes duplicates, optionally replacing them with symlinks.
12 use Ogg
::Vorbis
::Header
;
14 use String
::ShellQuote
;
# Master patterns: regular expressions in priority order. When several copies
# of a file exist, the copy whose path matches the earliest pattern in this
# list is the one kept as the master. Each entry is applied as a regex, so an
# unescaped '.' stands for any single character.
# Append further patterns at the end, lowest priority last.
@findMasterPatterns = (
    '^/data/public/Music/Production/Kevin/',
    '^/data/public/Music/Production/Andrew/',
    '^/data/public/Music/Production/Grouped/',
    '^/data/public/Photos/Structured/',
    '^/data/public/Christopher/',
    '^/data/public/Music/nietzche/',
    '^/data/public/Music/Production/Karl/',
    '^/data/public/Photos/Lost-Dates/',
    '^/data/public/Music/Danny_Todd/',
    '^/data/public/Music/Production/Daniel/',
    '^/data/public/Music/Production/Simon/Pop/',
    '^/data/public/movies/TV Episodes/',
    '^/data/public/Music/Production/01 Singles .Graydon./',
    '^/data/public/Music/Production/01 Singles .Peter./',
    '^/data/public/Erica/fromwork/Thesis/Histories of the discipline/',
    '^/data/public/Erica/fromwork/Thesis/Historis of the University in NZ/',
);
# Link patterns: regular expressions naming the directories in which a removed
# duplicate is replaced by a symlink to the master copy, because the original
# file name might still be referenced. A pattern of '.' matches every path and
# so always creates links.
push @linkPatterns, (
    '^/data/public/Music/Production/01 Singles .Erica./',
    '^/data/public/Christopher/',
);
49 #print "@findMasterPatterns findMasterPatterns loaded.\n";
50 #print "@linkPatterns linkPatterns loaded.\n";
# Output handle: the shell script being generated. Three-argument open keeps
# the mode separate from the file name. The bareword handle names are kept
# because the rest of the file prints to SCRIPT and reads from MD5s.
open(SCRIPT, '>', 'script.sh') or die("can't open script.sh: $!");
print SCRIPT "#!/bin/sh\n";

# Input handle: the checksum listing to process.
open(MD5s, '<', 'md5sorted.txt') or die("can't open md5sorted.txt: $!");
# Holds the current checksum group: slot 0 is the MD5, slots 1..$numCount the
# paths. Start with a genuinely empty array: assigning undef would create a
# one-element list, making any emptiness test on @dataStore always true.
my @dataStore = ();
# Record the path in slot $numCount, restoring the leading "/" that the split
# on " /" consumed.
68 $dataStore[$numCount]="/" . $fileName;
# A non-empty third field means " /" occurred again inside the path, so
# $fileName holds only part of it; report the line instead of failing.
69 if ($leftOver) { print "UNEXPECTED SPLIT VALUE FOR HASH $newMD5 RESIDUAL $leftOver\n" }
# Add the number of paths held for the group to the running total of files.
74 $totalFiles += $numCount;
# Slot 0 of the store carries the checksum; paths occupy slots 1..$numCount.
77 $dataStore[0] = $newMD5;
# Split the current input line ($_) on " /" into checksum, path (without its
# leading "/") and any residue.
# NOTE(review): with md5sum's two-space separator the checksum field would keep
# one trailing space; harmless for the eq comparison below as long as every
# line has the same format - confirm.
84 ($newMD5,$fileName,$leftOver) = split m! /!;
# Same checksum as the previous line, or the very first line: keep accumulating.
85 if ($lastMD5 eq $newMD5 || $firstOne)
87 #just load into the main data hash
# Checksum changed and the finished group held a single path.
91 elsif ($numCount == 1)
93 #It's different, but there was only one of them
94 #Because we're processing duplicates, no action needed
# Leave a comment in the generated script naming the file that had no duplicate.
95 print SCRIPT
"#IGNORE $dataStore[1]\n";
99 #actually do the work - this is split out as the input file will end, and we still want to process in that case
100 #note the last action is still to read resetStore
# Process the group still held in the store, but only if it contains duplicates.
# NOTE(review): presumably runs after the read loop has ended - the loop itself
# is not visible here.
105 if ($numCount > 1) { nextAndLast
(); }
# Summary of the run, written to standard output.
print "In this number of files: $totalFiles\n";
print "Linked: $totalLinked\n";
print "Removed: $totalRemoved\n";

# Share of the files that were linked or removed. With no files counted (for
# example an empty input file) the division would abort the script with
# "Illegal division by zero", so report 0 in that case.
my $tidiedPercentage = $totalFiles
    ? ($totalLinked + $totalRemoved) / $totalFiles * 100
    : 0;
printf "Tidied percentage %2.1f\n", $tidiedPercentage;
116 #process the records associated with the previous md5sum, which are in @dataStore[1 to $numCount]
118 #find the one to keep
# Walk the master patterns in priority order until a path matches or every
# pattern has been tried (@findMasterPatterns in numeric context is its size).
# NOTE(review): the [$patternID-1] index below implies $patternID is advanced
# before the match test - the increment is not visible here.
123 until ($oneToKeep or $patternID == @findMasterPatterns) {
# For the current pattern, scan the stored paths until one matches.
126 until ($oneToKeep or $i == $numCount) {
# The pattern string is applied as a regex; remember the slot of the first match.
128 if ($dataStore[$i] =~ $findMasterPatterns[$patternID-1]) {$oneToKeep = $i;} ;
# True when the store has at least one element.
132 if (@dataStore) { #if not, then first run
# No path matched any master pattern: stop rather than guess which copy to keep.
133 unless ($oneToKeep) {
134 die "Could not find one to keep for @dataStore"
138 #symlink or delete the others based on directory
# Visit every stored path, slots 1..$numCount.
140 for ($i=1;$i<$numCount+1;$i++)
142 if ($i == $oneToKeep) #if this is the one to keep
# The master copy is only noted in the generated script as a comment.
144 print SCRIPT
"#RETAIN $dataStore[$i]\n";
# Scan the link patterns until $done is set or all have been tried.
# NOTE(review): the [$j-1] index below implies $j is advanced before the match
# test - the increment and the setting of $done are not visible here.
150 until ($done or $j==@linkPatterns)
# Path lies in a directory that wants links: emit "rm <path> && ln -s <master> <path>",
# with every argument quoted for the shell by shell_quote.
153 if ($dataStore[$i] =~ $linkPatterns[$j-1]) {
154 @temp = shell_quote
("rm", "$dataStore[$i]");
155 @temp2 = shell_quote
("ln", "-s", "$dataStore[$oneToKeep]", "$dataStore[$i]") ;
156 print SCRIPT
"@temp && @temp2\n";
# Emit a plain, shell-quoted rm for the path.
# NOTE(review): presumably reached only when no link pattern matched - the
# enclosing condition is not visible here.
163 @temp = shell_quote
("rm", "$dataStore[$i]");
164 print SCRIPT
"@temp\n";
170 # And reset the data store etc.