src/audiodup

   1 #!/bin/bash
   2
   3 ALGO=md5
   4 PERCENT=2
   5 PATTERN='*.mp3'
   6 while getopts ":a:b:p:h" OPT; do
   7         case $OPT in
   8         a)
   9                 ALGO=$OPTARG;
  10                 ;;
  11         b)
  12                 PERCENT=$OPTARG;
  13                 ;;
  14         p)
  15                 PATTERN=$OPTARG;
  16                 ;;
  17         h)
  18                 echo "
  19 This script finds duplicate mp3 files by audio content (it ignores tags).
  20
  21 usage: $0 [-a algorithm] [-b percent] [-p pattern] [path] [path...]
  22
  23   -a algorithm   Chooses the hash algorithm to use. See audiosum -l.
  24   -b percent     Integer >=1 and <=99, chooses the amount of the file data to
  25                  read during the intermediate phase of processing.
  26   -p pattern     File pattern to match against.
  27
  28 This script works by processing the mp3 files according to audiosum -h. This
  29 is made by three parts:
  30
  31 1. Discards files which have different sizes.
  32 2. Discards files with same size but different hash for the first n% data.
  33 3. Discards files with same size but different hash for the whole data.
  34
  35 The remaining files are duplicated files.
  36 "
  37         exit
  38         ;;
  39         esac
  40 done
  41
  42 HASH_LEN=`audiosum -l | grep ": .$ALGO" | ( read COLON NAME LEN; echo $((LEN/4)) )`
  43 if [ -z "$HASH_LEN" ]; then
  44         echo Algorithm not supported.
  45         exit 1;
  46 fi
  47
  48 if [[ $HASH_LEN = *[^0-9]* ]]; then
  49         echo Algorithm not supported.
  50         exit 1;
  51 fi
  52
  53 if [[ $PERCENT = *[^0-9]* ]]; then
  54         echo Percent value must be an integer between 1 and 99.
  55         exit 1;
  56 fi
  57
  58 if [ $PERCENT -lt 1 ] || [ $PERCENT -gt 99 ]; then
  59         echo Percent value must be an integer between 1 and 99.
  60         exit 1;
  61 fi
  62
  63 shift $((OPTIND-1))
  64
  65 find -- $* -type f -iname "$PATTERN" | \
  66     audiosum -b | sort | uniq -D -w 8 | cut -d ' ' -f 6- | \
  67     audiosum -b $PERCENT | sort | uniq -D -w $((HASH_LEN+9)) | cut -d ' ' -f 7- | \
  68     audiosum | sort | uniq --all-repeated=separate -w $((HASH_LEN+9))