# processing.py -- various audio processing functions
# Copyright (C) 2008 MUSIC TECHNOLOGY GROUP (MTG)
#                    UNIVERSITAT POMPEU FABRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#  Bram de Jong <bram.dejong at domain.com where domain in gmail>
#  2012, Joar Wandborg <first name at last name dot se>

from __future__ import print_function

try:
    from PIL import Image
except ImportError:
    import Image
import math
import numpy

try:
    import scikits.audiolab as audiolab
except ImportError:
    print("WARNING: audiolab is not installed so wav2png will not work")


class AudioProcessingException(Exception):
    pass


class SpectrogramImage(object):
    def __init__(self, image_size, fft_size):
        self.image_width, self.image_height = image_size
        self.fft_size = fft_size

        colors = [
            (0, 0, 0, 0),
            (58 / 4, 68 / 4, 65 / 4, 255),
            (80 / 2, 100 / 2, 153 / 2, 255),
            (90, 180, 100, 255),
            (224, 224, 44, 255),
            (255, 60, 30, 255),
            (255, 255, 255, 255)
        ]

        self.palette = interpolate_colors(colors)

        # Generate lookup table for y-coordinate from fft-bin
        self.y_to_bin = []

        fft_min = 100.0
        fft_max = 22050.0  # Hz (Nyquist frequency for 44.1 kHz audio)

        y_min = math.log10(fft_min)
        y_max = math.log10(fft_max)

        for y in range(self.image_height):
            freq = math.pow(
                10.0,
                y_min + y / (self.image_height - 1.0)
                * (y_max - y_min))

            fft_bin = freq / fft_max * (self.fft_size / 2 + 1)

            if fft_bin < self.fft_size / 2:
                alpha = fft_bin - int(fft_bin)

                self.y_to_bin.append((int(fft_bin), alpha * 255))
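
        # Rough illustration of the mapping above (assuming fft_size=2048):
        # the bottom image row corresponds to 100 Hz, i.e. FFT bin
        # 100 / 22050 * 1025 ~= 4.6, while rows near the top approach
        # bin 1024 (22.05 kHz); rows whose bin would reach fft_size / 2
        # are simply not added to y_to_bin.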

        # this is a bit strange, but using image.load()[x, y] = ... is
        # a lot slower than using image.putdata() and then rotating the image,
        # so we store all the pixels in an array and then create the image when saving
        self.pixels = []

    def draw_spectrum(self, x, spectrum):
        # for all frequencies, draw the pixels
        for index, alpha in self.y_to_bin:
            self.pixels.append(
                self.palette[int((255.0 - alpha) * spectrum[index]
                            + alpha * spectrum[index + 1])])

        # if the FFT is too small to fill up the image, fill with black to the top
        for y in range(len(self.y_to_bin), self.image_height):
            self.pixels.append(self.palette[0])

    def save(self, filename, quality=90):
        self.image = Image.new(
            'RGBA',
            (self.image_height, self.image_width))

        self.image.putdata(self.pixels)
        self.image.transpose(Image.ROTATE_90).save(
            filename,
            quality=quality)


class AudioProcessor(object):
    """
    The audio processor processes chunks of audio and calculates the spectral
    centroid and the peak samples in that chunk of audio.
    """
    def __init__(self, input_filename, fft_size, window_function=numpy.hanning):
        max_level = get_max_level(input_filename)

        self.audio_file = audiolab.Sndfile(input_filename, 'r')
        self.fft_size = fft_size
        self.window = window_function(self.fft_size)
        self.spectrum_range = None
        self.lower = 100
        self.higher = 22050
        self.lower_log = math.log10(self.lower)
        self.higher_log = math.log10(self.higher)
        self.clip = lambda val, low, high: min(high, max(low, val))

        # figure out the maximum FFT value by taking the FFT of a windowed DC signal
        fft = numpy.fft.rfft(numpy.ones(fft_size) * self.window)
        max_fft = (numpy.abs(fft)).max()

        # set the scale to normalized audio and normalized FFT
        self.scale = 1.0 / max_level / max_fft if max_level > 0 else 1
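
        # Rough sanity check: with a 2048-point Hann window, max_fft is
        # approximately fft_size / 2, and max_level is the peak sample value,
        # so dividing by both keeps the scaled spectrum computed in
        # spectral_centroid() roughly within [0, 1].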

    def read(self, start, size, resize_if_less=False):
        """ read size samples starting at start, if resize_if_less is True and less than size
        samples are read, resize the array to size and fill with zeros """

        # number of zeros to add to start and end of the buffer
        add_to_start = 0
        add_to_end = 0

        if start < 0:
            # the first FFT window starts centered around zero
            if size + start <= 0:
                return numpy.zeros(size) if resize_if_less else numpy.array([])
            else:
                self.audio_file.seek(0)

                add_to_start = -start  # remember: start is negative!
                to_read = size + start

                if to_read > self.audio_file.nframes:
                    add_to_end = to_read - self.audio_file.nframes
                    to_read = self.audio_file.nframes
        else:
            self.audio_file.seek(start)

            to_read = size
            if start + to_read >= self.audio_file.nframes:
                to_read = self.audio_file.nframes - start
                add_to_end = size - to_read

        try:
            samples = self.audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen for wave files with broken headers...
            return numpy.zeros(size) if resize_if_less else numpy.zeros(2)

        # convert to mono by selecting left channel only
        if self.audio_file.channels > 1:
            samples = samples[:, 0]

        if resize_if_less and (add_to_start > 0 or add_to_end > 0):
            if add_to_start > 0:
                # samples is 1-D at this point, so concatenate along axis 0
                samples = numpy.concatenate((numpy.zeros(add_to_start), samples), axis=0)

            if add_to_end > 0:
                samples = numpy.resize(samples, size)
                samples[size - add_to_end:] = 0

        return samples

    def spectral_centroid(self, seek_point, spec_range=110.0):
        """ starting at seek_point read fft_size samples, and calculate the spectral centroid """

        samples = self.read(seek_point - self.fft_size / 2, self.fft_size, True)

        samples *= self.window
        fft = numpy.fft.rfft(samples)
        spectrum = self.scale * numpy.abs(fft)  # normalized abs(FFT) between 0 and 1

        length = numpy.float64(spectrum.shape[0])

        # scale the db spectrum from [-spec_range db ... 0 db] to [0..1]
        db_spectrum = ((20 * (numpy.log10(spectrum + 1e-60))).clip(-spec_range, 0.0) + spec_range) / spec_range

        energy = spectrum.sum()
        spectral_centroid = 0

        if energy > 1e-60:
            # calculate the spectral centroid

            if self.spectrum_range is None:
                self.spectrum_range = numpy.arange(length)

            spectral_centroid = (spectrum * self.spectrum_range).sum() / (energy * (length - 1)) * self.audio_file.samplerate * 0.5

            # clip > log10 > scale between 0 and 1
            spectral_centroid = (math.log10(self.clip(spectral_centroid, self.lower, self.higher)) - self.lower_log) / (self.higher_log - self.lower_log)

        return (spectral_centroid, db_spectrum)
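
        # For example, a centroid at 1 kHz maps to
        # (log10(1000) - log10(100)) / (log10(22050) - log10(100)) ~= 0.43
        # on the returned 0..1 scale.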

    def peaks(self, start_seek, end_seek):
        """ read all samples between start_seek and end_seek, then find the minimum and maximum peak
        in that range. Returns that pair in the order they were found. So if min was found first,
        it returns (min, max) else the other way around. """

        # larger block sizes are faster but take more memory...
        # Aha, Watson, a clue, a tradeoff!
        block_size = 4096

        max_index = -1
        max_value = -1
        min_index = -1
        min_value = 1

        if start_seek < 0:
            start_seek = 0

        if end_seek > self.audio_file.nframes:
            end_seek = self.audio_file.nframes

        if end_seek <= start_seek:
            samples = self.read(start_seek, 1)
            return (samples[0], samples[0])

        if block_size > end_seek - start_seek:
            block_size = end_seek - start_seek

        for i in range(start_seek, end_seek, block_size):
            samples = self.read(i, block_size)

            local_max_index = numpy.argmax(samples)
            local_max_value = samples[local_max_index]

            if local_max_value > max_value:
                max_value = local_max_value
                max_index = local_max_index

            local_min_index = numpy.argmin(samples)
            local_min_value = samples[local_min_index]

            if local_min_value < min_value:
                min_value = local_min_value
                min_index = local_min_index

        return (min_value, max_value) if min_index < max_index else (max_value, min_value)


def create_spectrogram_image(source_filename, output_filename,
                             image_size, fft_size, progress_callback=None):

    processor = AudioProcessor(source_filename, fft_size, numpy.hamming)
    samples_per_pixel = processor.audio_file.nframes / float(image_size[0])

    spectrogram = SpectrogramImage(image_size, fft_size)

    for x in range(image_size[0]):
        if progress_callback and x % (image_size[0] / 10) == 0:
            progress_callback((x * 100) / image_size[0])

        seek_point = int(x * samples_per_pixel)
        next_seek_point = int((x + 1) * samples_per_pixel)

        (spectral_centroid, db_spectrum) = processor.spectral_centroid(seek_point)

        spectrogram.draw_spectrum(x, db_spectrum)

    if progress_callback:
        progress_callback(100)

    spectrogram.save(output_filename)
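
# Example usage (hypothetical filenames and sizes):
#
#     create_spectrogram_image('input.wav', 'spectrogram.png', (500, 171), 2048)
#
# renders a 500x171-pixel spectrogram of input.wav using 2048-point FFTs.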


def interpolate_colors(colors, flat=False, num_colors=256):

    palette = []

    for i in range(num_colors):
        # fractional position of this palette entry within the input colour
        # list: a float index in [0, len(colors) - 1]
        index = i * (len(colors) - 1) / (num_colors - 1.0)

        # signed distance (in [-0.5, 0.5]) from the nearest input colour,
        # used to blend linearly between neighbouring colours
        alpha = index - round(index)

        channels = list('rgb')
        values = dict()

        for k, v in zip(range(len(channels)), channels):
            if alpha > 0:
                values[v] = ((1.0 - alpha) * colors[int(index)][k]
                             + alpha * colors[int(index) + 1][k])
            else:
                values[v] = (1.0 - alpha) * colors[int(index)][k]

        if flat:
            palette.extend(
                tuple(int(values[i]) for i in channels))
        else:
            palette.append(
                tuple(int(values[i]) for i in channels))

    return palette
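
# interpolate_colors() is what turns the 7-colour gradient in
# SpectrogramImage.__init__ into a 256-entry palette, so a spectrum value
# scaled to the 0..255 range can be used directly as a palette index.
# With flat=True the channels are returned as one flat list
# ([r0, g0, b0, r1, ...]), the layout PIL's Image.putpalette() expects.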


def get_max_level(filename):
    max_value = 0
    buffer_size = 4096
    audio_file = audiolab.Sndfile(filename, 'r')
    n_samples_left = audio_file.nframes

    while n_samples_left:
        to_read = min(buffer_size, n_samples_left)

        try:
            samples = audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen with a broken header
            break

        # convert to mono by selecting left channel only
        if audio_file.channels > 1:
            samples = samples[:, 0]

        max_value = max(max_value, numpy.abs(samples).max())

        n_samples_left -= to_read

    audio_file.close()

    return max_value


if __name__ == '__main__':
    import sys
    sys.argv[4] = int(sys.argv[4])
    sys.argv[3] = tuple([int(i) for i in sys.argv[3].split('x')])

    create_spectrogram_image(*sys.argv[1:])
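
    # Example invocation (hypothetical filenames; the third argument is the
    # image size as WIDTHxHEIGHT in pixels, the fourth is the FFT size):
    #
    #     python spectrogram.py input.wav output.png 500x171 2048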