I'm implementing FFT pitch detection on the iPhone using Apple's Accelerate framework, as discussed many times here before.
I understand phase offsets and bin frequencies, and I've researched several open-source tuners that use FFT techniques (simple pitch detection, autocorrelation, cepstrum, and the like) to detect pitch. Here's my problem:
My FFT results are consistently off by 5-10 Hz in either direction, even when the bins are only 1-2 Hz apart. I've tried different algorithms, and even a simple FFT sampled at high resolution shows the magnitude spikes in seemingly the wrong places. And it's not a consistent offset: some readings are too high, some too low.
For instance, with a tone generator, a 440 Hz tone comes across as 445.2 Hz, 220 Hz as 214 Hz, 880 Hz as 874 Hz, and 1174 Hz as 1183 Hz. A similar open-source tuner for the Mac using almost exactly the same algorithms has no problem detecting these pitches perfectly. (The errors differ between the device and the simulator, but the results are off in both.)
I don't think the problem is bin resolution, because there are often a few bins between the actual tone and the detected magnitude spike. It's as though the input is just hearing the wrong pitch.
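For scale: bin spacing is sampleRate / kSamples, so at an assumed 44.1 kHz a 4096-point FFT gives 44100 / 4096 ≈ 10.8 Hz per bin. To convince myself the analysis chain itself isn't the problem, one check is to synthesize a known tone and push it through the same window + FFT + peak-pick steps. This is a minimal standalone sketch, not my app code; the 44.1 kHz rate and 4096-point frame are stand-ins for whatever sampleRate and MAX_FRAME_LENGTH actually are:

#include <Accelerate/Accelerate.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    const int n = 4096, log2n = 12;        // stand-in for MAX_FRAME_LENGTH
    const double sampleRate = 44100.0;     // stand-in; use the real session rate
    const double tone = 440.0;

    // Synthesize a pure tone, then window it the same way processBuffer does.
    static float signal[4096], window[4096], windowed[4096];
    for (int i = 0; i < n; i++)
        signal[i] = (float)sin(2.0 * M_PI * tone * i / sampleRate);
    vDSP_hann_window(window, n, 0);
    vDSP_vmul(signal, 1, window, 1, windowed, 1, n);

    // Pack into split-complex form and run the forward real FFT.
    static float realp[2048], imagp[2048];
    DSPSplitComplex split = { realp, imagp };
    vDSP_ctoz((DSPComplex *)windowed, 2, &split, 1, n / 2);
    FFTSetup setup = vDSP_create_fftsetup(log2n, kFFTRadix2);
    vDSP_fft_zrip(setup, &split, 1, log2n, FFT_FORWARD);

    // The peak magnitude should land within one bin (~10.8 Hz) of the tone.
    static float mags[2048];
    vDSP_zvmags(&split, 1, mags, 1, n / 2);
    float peakVal;
    vDSP_Length peak;
    vDSP_maxvi(mags, 1, &peakVal, &peak, n / 2);
    printf("peak bin %lu -> %.1f Hz (tone was %.1f Hz)\n",
           (unsigned long)peak, peak * sampleRate / n, tone);

    vDSP_destroy_fftsetup(setup);
    return 0;
}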
I've pasted my code below. The general flow is simple:
Push a step onto the FFT buffer -> Hann Window -> FFT -> Phase/Magnitude -> Max pitch is wrong.
enum {
    kOversample = 4,
    kSamples = MAX_FRAME_LENGTH,
    kSamples2 = kSamples / 2,
    kRange = kSamples * 5 / 16,
    kStep = kSamples / kOversample
};

const int PENDING_LEN = kSamples * 5;
static float pendingAudio[PENDING_LEN]; // samples waiting to be processed
static int pendingAudioLength = 0;
- (void)processBuffer {
    static float window[kSamples];
    static float phase[kRange];
    static float lastPhase[kRange];
    static float phaseDeltas[kRange];
    static float frequencies[kRange];
    static float slidingFFTBuffer[kSamples];
    static float buffer[kSamples];
    static BOOL initialized = NO;

    if (!initialized) {
        memset(lastPhase, 0, kRange * sizeof(float));
        vDSP_hann_window(window, kSamples, 0);
        initialized = YES;
    }
    BOOL canProcessNewStep = YES;
    while (canProcessNewStep) {
        @synchronized (self) {
            if (pendingAudioLength < kStep) {
                break; // not enough data
            }
            // Rotate one step's worth of pendingAudio onto the end of slidingFFTBuffer
            memmove(slidingFFTBuffer, slidingFFTBuffer + kStep, (kSamples - kStep) * sizeof(float));
            memmove(slidingFFTBuffer + (kSamples - kStep), pendingAudio, kStep * sizeof(float));
            memmove(pendingAudio, pendingAudio + kStep, (PENDING_LEN - kStep) * sizeof(float));
            pendingAudioLength -= kStep;
            canProcessNewStep = (pendingAudioLength >= kStep);
        }

        // Hann windowing
        vDSP_vmul(slidingFFTBuffer, 1, window, 1, buffer, 1, kSamples);
        // Pack into split-complex form and carry out a forward real FFT in place
        vDSP_ctoz((COMPLEX *)buffer, 2, &splitComplex, 1, kSamples2);
        vDSP_fft_zrip(fftSetup, &splitComplex, 1, log2f(kSamples), FFT_FORWARD);

        // Squared magnitudes, converted to decibels
        static float magnitudes[kRange];
        vDSP_zvmags(&splitComplex, 1, magnitudes, 1, kRange);
        float reference = 1.0f; // 0 dB reference
        vDSP_vdbcon(magnitudes, 1, &reference, magnitudes, 1, kRange, 0);

        // Phase, and each bin's phase difference since the previous step
        vDSP_zvphas(&splitComplex, 1, phase, 1, kRange);
        vDSP_vsub(lastPhase, 1, phase, 1, phaseDeltas, 1, kRange); // phase - lastPhase
        memcpy(lastPhase, phase, kRange * sizeof(float)); // save this step's phase

        double freqPerBin = sampleRate / (double)kSamples;
        double phaseStep = 2.0 * M_PI * (double)kStep / (double)kSamples;
        // Process the phase difference (via https://stackoverflow.com/questions/4633203)
        for (int k = 1; k < kRange; k++) {
            double delta = phaseDeltas[k];
            delta -= k * phaseStep;               // subtract the expected phase advance for bin k
            delta = remainder(delta, 2.0 * M_PI); // map delta phase into the +/- pi interval
            delta /= phaseStep;                   // deviation from the bin center, in bins
            frequencies[k] = (k + delta) * freqPerBin; // the refined "true" frequency
        }
        NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
        MCTunerData *tunerData = [[[MCTunerData alloc] initWithSize:MAX_FRAME_LENGTH] autorelease];

        double maxMag = -INFINITY;
        float maxFreq = 0;
        for (int i = 0; i < kRange; i++) {
            [tunerData addFrequency:frequencies[i] withMagnitude:magnitudes[i]];
            if (magnitudes[i] > maxMag) {
                maxFreq = frequencies[i];
                maxMag = magnitudes[i];
            }
        }
        NSLog(@"Max Frequency: %.1f", maxFreq);

        [tunerData calculate];

        // Update the UI with our newly acquired frequency value.
        [self.delegate frequencyChangedWithValue:[tunerData mainFrequency] data:tunerData];
        [pool drain];
    }
}
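As a sanity check, the phase-vocoder refinement above should recover a between-bins tone exactly when fed an ideal phase advance, so I believe that math is sound. Here's a minimal standalone sketch of just that step; the constants are stand-ins (1024-point FFT at 44.1 kHz, with the same 4x-oversampled hop as kStep):

#include <math.h>
#include <stdio.h>

int main(void) {
    const double sampleRate = 44100.0;  // stand-in for the real session rate
    const int n = 1024, hop = n / 4;    // same 4x oversampling as kOversample
    const double freqPerBin = sampleRate / n;
    const double phaseStep = 2.0 * M_PI * hop / n;
    const double tone = 443.0;          // deliberately between bin centers

    // The phase advance two successive FFTs would measure for this tone.
    double measured = 2.0 * M_PI * tone * hop / sampleRate;

    int k = (int)round(tone / freqPerBin);    // nearest bin, here k = 10
    double delta = measured - k * phaseStep;  // subtract expected advance
    delta = remainder(delta, 2.0 * M_PI);     // wrap into +/- pi
    double freq = (k + delta / phaseStep) * freqPerBin;

    printf("recovered %.3f Hz (tone was %.3f Hz)\n", freq, tone); // prints 443.000
    return 0;
}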
OSStatus renderCallback(void *inRefCon, AudioUnitRenderActionFlags *ioActionFlags,
                        const AudioTimeStamp *inTimeStamp, UInt32 inBusNumber,
                        UInt32 inNumberFrames, AudioBufferList *ioData)
{
    MCTuner *tuner = (MCTuner *)inRefCon;

    OSStatus err = AudioUnitRender(tuner->audioUnit, ioActionFlags, inTimeStamp, 1,
                                   inNumberFrames, tuner->bufferList);
    if (err != noErr) {
        return err;
    }

    SInt16 *inputInts = (SInt16 *)tuner->bufferList->mBuffers[0].mData;

    @synchronized (tuner) {
        if (pendingAudioLength + inNumberFrames < PENDING_LEN) {
            // Append the incoming audio to the pending buffer, converting SInt16 to
            // float because iOS doesn't support recording floats directly
            for (UInt32 i = 0; i < inNumberFrames; i++) {
                pendingAudio[pendingAudioLength + i] = (inputInts[i] + 0.5) / 32767.5;
            }
            pendingAudioLength += inNumberFrames;
        } else {
            // The buffer got too far behind; don't add any more audio data.
            NSLog(@"Dropping frames...");
        }

        if (pendingAudioLength >= kStep) {
            [tuner performSelectorOnMainThread:@selector(processBuffer)
                                    withObject:nil
                                 waitUntilDone:NO];
        }
    }
    return noErr;
}
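As an aside on the SInt16-to-float loop above (unrelated to the pitch error, as far as I can tell): Accelerate can do the same conversion vectorized. A sketch of the equivalent, replacing the conversion loop and the length update inside the callback:

// Equivalent of the scalar loop: y = (x + 0.5) / 32767.5, in two vDSP calls.
float scale = 1.0f / 32767.5f;
float offset = 0.5f / 32767.5f;
vDSP_vflt16(inputInts, 1, pendingAudio + pendingAudioLength, 1, inNumberFrames);
vDSP_vsmsa(pendingAudio + pendingAudioLength, 1, &scale, &offset,
           pendingAudio + pendingAudioLength, 1, inNumberFrames);
pendingAudioLength += inNumberFrames;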
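And since freqPerBin is derived directly from sampleRate, one thing worth double-checking is that the rate used in processBuffer matches what the hardware actually delivers, since any mismatch would skew every frequency estimate. A minimal sketch using the AudioSession C API (audioUnit and the analysis rate are the same ones used above; this assumes the audio session has already been initialized):

#import <AudioToolbox/AudioToolbox.h>

// Log the hardware rate, the render format on the input bus, and the rate
// used for analysis, so any disagreement between them is visible.
static void LogSampleRates(AudioUnit audioUnit, double analysisRate) {
    Float64 hwRate = 0;
    UInt32 size = sizeof(hwRate);
    AudioSessionGetProperty(kAudioSessionProperty_CurrentHardwareSampleRate,
                            &size, &hwRate);

    AudioStreamBasicDescription asbd = {0};
    size = sizeof(asbd);
    AudioUnitGetProperty(audioUnit, kAudioUnitProperty_StreamFormat,
                         kAudioUnitScope_Output, 1, &asbd, &size);

    NSLog(@"hardware: %.1f Hz, input bus: %.1f Hz, analysis: %.1f Hz",
          hwRate, asbd.mSampleRate, analysisRate);
}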