Announce

PukiWiki contents have been moved into SONOTS Plugin (20070703)

Project: Pitch Detection

Table of Contents

Introduction

The pitch determination is very important for many speech processing algorithms. In this project, pitch detection methods via autocorrelation method, cepstrum method, harmonic product specturm (HPS), and linear predictive coding (LPC) are examined.

The report is available at filereport
Download matlab codes and data

PitchFrameLPC.png

First Edition: April 2008. Last Modified: April 2008
Tag: Scientific SoundProcessing Pitch Formant Matlab

Pitch Detection via Cepstral Method

Cepstrum

% NAME
%   spCepstrum - The cepstrum
% SYNOPSIS
%   [c, y] = spCepstrum(x, fs, window, show)
% DESCRIPTION
%   Obtain cepstrum coefficients of a signal
% INPUTS
%   x        (vector) of size Nx1 which contains signal
%   fs       (scalar) the sampling frequency
%   [window] (string) the window function such as 'rectwin', 'hamming'. 
%             The default is 'rectwin' (or no window). 
%   [show]   (bool)   plot or not. The default is 0.
% OUTPUTS
%   c        (vector) of size Nx1 which contains cepstrum coefficients
%   [y]      (vector) of size Nx1 which contains fourier response
% AUTHOR
%   Naotoshi Seo, April 2008
function [c, y] = spCepstrum(x, fs, window, show)
 %% Initialization
 N = length(x);
 x = x(:); % assure column vector
 if ~exist('show', 'var') || isempty(show)
     show = 0;
 end
 if ~exist('window', 'var') || isempty(window)
     window = 'rectwin';
 end
 if ischar(window);
     window = eval(sprintf('%s(N)', window)); % hamming(N)
 end

 %% do fourier transform of a windowed signal
 x = x(:) .* window(:);
 y = fft(x, N);

 %% Cepstrum is IDFT (or DFT) of log spectrum
 c = ifft(log(abs(y)+eps));

 if show
     ms1=fs/1000; % 1ms. maximum speech Fx at 1000Hz
     ms20=fs/50;  % 20ms. minimum speech Fx at 50Hz

     %% plot waveform
     t=(0:N-1)/fs;        % times of sampling instants
     subplot(2,1,1);
     plot(t,x);
     legend('Waveform');
     xlabel('Time (s)');
     ylabel('Amplitude');

     %% plot cepstrum between 1ms (=1000Hz) and 20ms (=50Hz)
     %% DC component c(0) is too large
     q=(ms1:ms20)/fs;
     subplot(2,1,2);
     plot(q,abs(c(ms1:ms20)));
     legend('Cepstrum');
     xlabel('Quefrency (s) 1ms (1000Hz) to 20ms (50Hz)');
     ylabel('Amplitude');
 end
end
function spCepstrumDemo 
 [x, fs] = wavread('bee.wav'); x = x(1000:1480);
 % x = wgn(1, 1000, 2); fs = 16000;
 c = spCepstrum(x, fs, 'hamming', 'plot');
end

Pitch Detection

% NAME
%   spPitchCepstrum - Pitch Estimation via Cepstral Method
% SYNOPSIS
%   [f0] = spPitchCepstrum(c, fs)
% DESCRIPTION
%   Estimate pitch frequencies via Cepstral method
% INPUTS
%   c        (vector) of size Nx1 which contains cepstrum coefficients. 
%             Use spCepstrum.m
%   fs       (scalar) the sampling frequency of the original signal
% OUTPUTS
%   f0       (scalar) the estimated pitch
% AUTHOR
%   Naotoshi Seo, April 2008
% SEE ALSO
%   spCepstrum.m
function [f0] = spPitchCepstrum(c, fs)
 % search for maximum  between 2ms (=500Hz) and 20ms (=50Hz)
 ms2=floor(fs*0.002); % 2ms
 ms20=floor(fs*0.02); % 20ms
 [maxi,idx]=max(abs(c(ms2:ms20)));
 f0 = fs/(ms2+idx-1);
end
function spPitchCepstrumDemo 
 [x, fs] = wavread('bee.wav'); x = x(1000:1480);
 % x = wgn(1, 1000, 2); fs = 16000;
 c = spCepstrum(x, fs, 'hamming', 'plot');
 f0 = spPitchCepstrum(c, fs)

Pitch Tracking in Short-Time Frames

% NAME
%   spPitchTrackCepstrum: Pitch Tracking via the Cepstral Method 
% SYNOPSIS
%   [F0, T, C] = 
%     spPitchTrackCepstrum(x, fs, frame_length, frame_overlap, window, show)
% DESCRIPTION
%   Track F0 formants in time bins
% INPUTS
%   x               (vector) of size Nx1.
%   fs              (scalar) the sampling rate in Hz. 
%   [frame_length]  (scalar) the length of each frame in micro second. 
%                    The default is 30ms.  
%   [frame_overlap] (scalar) the length of each frame overlaps in micro second. 
%                    The default is frame_length / 2. 
%   [window]        (string) the window function such as rectwin, hamming.  
%                    if not specified, equivalent to hamming
%   [show]          (bool)   plot or not. The default is 0.
% OUTPUTS
%   F0              (vector) of size 1xK which contains the fundamental 
%                    frequencies at each frame where K is the number of frames. 
%   T               (vector) of size 1xK whose value corresponds to the
%                    time center of each frame in second
%   [C]             (matrix) of size MxK which contains cepstrogram
% AUTHOR
%   Naotoshi Seo, April 2006
function [F0, T, C] = spPitchTrackCepstrum(x, fs, frame_length, frame_overlap, window, show)
 %% Initialization
 N = length(x);
 if ~exist('frame_length', 'var') || isempty(frame_length)
     frame_length = 30;
 end
 if ~exist('frame_overlap', 'var') || isempty(frame_overlap)
     frame_overlap = 20;
 end
 if ~exist('window', 'var') || isempty(window)
     window = 'hamming';
 end
 if ~exist('show', 'var') || isempty(show)
     show = 0;
 end
 nsample = round(frame_length  * fs / 1000); % convert ms to points
 noverlap = round(frame_overlap * fs / 1000); % convert ms to points
 if ischar(window)
     window   = eval(sprintf('%s(nsample)', window)); % e.g., hamming(nfft)
 end

 %% Pitch detection for each frame
 pos = 1; i = 1;
 while (pos+nsample < N)
     frame = x(pos:pos+nsample-1);
     C(:,i) = spCepstrum(frame, fs, window);
     F0(i) = spPitchCepstrum(C(:,i), fs);
     pos = pos + (nsample - noverlap);
     i = i + 1;
 end
 T = (round(nsample/2):(nsample-noverlap):N-1-round(nsample/2))/fs;

if show 
    % plot waveform
    subplot(2,1,1);
    t = (0:N-1)/fs;
    plot(t, x);
    legend('Waveform');
    xlabel('Time (s)');
    ylabel('Amplitude');
    xlim([t(1) t(end)]);

    % plot F0 track
    subplot(2,1,2);
    plot(T,F0);
    legend('pitch track');
    xlabel('Time (s)');
    ylabel('Frequency (Hz)');
    xlim([t(1) t(end)]);
end
[x, fs] = wavread('bee.wav');
[F0, T, C] = spPitchTrackCepstrum(x, fs, 30, 20, 'hamming', 'plot');

Pitch Detection via Auto-correlation Method

Auto-correlation

plot

% NAME
%   spCorr - Auto-correlation of a signal (Correlogram)
% SYNOPSIS
%   [r] = spCorr(x, fs, maxlag, show)
% DESCRIPTION
%   Obtain Auto-correlation coefficients of a signal
% INPUTS
%   x        (vector) of size Nx1 which contains signal
%   fs       (scalar) the sampling frequency
%   [maxlag] (scalar) seek the correlation sequence over the lag range 
%             [-maxlag:maxlag]. Output r has length 2*maxlag+1.
%             The default is 20ms lag, that is, 50Hz (the minimum possible
%             F0 frequency of human speech)
% OUTPUTS
%   r        (vector) of size 2*maxlag+1 which contains 
%             correlation coefficients
% AUTHOR
%   Naotoshi Seo, April 2008
% USES
%   xcorr.m (Signal Processing toolbox)
function [r] = spCorrelum(x, fs, maxlag, show)
 %% Initialization
 if ~exist('maxlag', 'var') || isempty(maxlag)
     maxlag = fs/50; % F0 is greater than 50Hz => 20ms maxlag
 end
 if ~exist('show', 'var') || isempty(show)
     show = 0;
 end

 %% Auto-correlation
 r = xcorr(x, maxlag, 'coeff');

 if show
     %% plot waveform
     t=(0:length(x)-1)/fs;        % times of sampling instants
     subplot(2,1,1);
     plot(t,x);
     legend('Waveform');
     xlabel('Time (s)');
     ylabel('Amplitude');
     xlim([t(1) t(end)]);

     %% plot autocorrelation
     d=(-maxlag:maxlag)/fs;
     subplot(2,1,2);
     plot(d,r);
     legend('Auto-correlation');
     xlabel('Lag (s)');
     ylabel('Correlation coef');
 end
end
[x, fs] = wavread('bee.wav'); x = x(1000:1480);
r = spCorr(x, fs, [], 'plot');

Pitch Detection

% NAME
%   spPitchCorr - Pitch Estimation via Auto-correlation Method
% SYNOPSIS
%   [f0] = spPitchCorr(r, fs)
% DESCRIPTION
%   Estimate pitch frequencies via Cepstral method
% INPUTS
%   r        (vector) of size (maxlag*2+1)x1 which contains Corr coefficients. 
%             Use spCorr.m
%   fs       (scalar) the sampling frequency of the original signal
% OUTPUTS
%   f0       (scalar) the estimated pitch
% AUTHOR
%   Naotoshi Seo, April 2008
% SEE ALSO
%   spCorr.m
function [f0] = spPitchCorr(r, fs)
 % search for maximum  between 2ms (=500Hz) and 20ms (=50Hz)
 ms2=floor(fs/500); % 2ms
 ms20=floor(fs/50); % 20ms
 % half is just mirror for real signal
 r = r(floor(length(r)/2):end);
 [maxi,idx]=max(r(ms2:ms20));
 f0 = fs/(ms2+idx-1);
end
[x, fs] = wavread('bee.wav'); x = x(1000:1480);
r = spCorr(x, fs, [], 'plot');
f0 = spPitchCorr(r, fs)

Pitch Tracking in Short-Time Frames

% NAME
%   spPitchTrackCorr: Pitch Tracking via the Auto-correlation Method 
% SYNOPSIS
%   [F0, T, R] = 
%     spPitchTrackCorr(x, fs, frame_length, frame_overlap, maxlag, show)
% DESCRIPTION
%   Track F0 formants in time bins
% INPUTS
%   x               (vector) of size Nx1.
%   fs              (scalar) the sampling rate in Hz. 
%   [frame_length]  (scalar) the length of each frame in micro second. 
%                    The default is 30ms.  
%   [frame_overlap] (scalar) the length of each frame overlaps in micro second. 
%                    The default is frame_length / 2. 
%   [maxlag]        (scalar) seek the correlation sequence over the lag range 
%                    [-maxlag:maxlag]. The default is 20ms lag, that is, 
%                    50Hz (the minimum possible F0 frequency of human
%                    speech). 20ms lag must be enough. 
%   [show]          (bool)   plot or not. The default is 0.
% OUTPUTS
%   F0              (vector) of size 1xK which contains the fundamental 
%                    frequencies at each frame where K is the number of frames. 
%   T               (vector) of size 1xK whose value corresponds to the
%                    time center of each frame in second
%   [R]             (matrix) of size MxK which contains correlogram
% USES
%   spPitchCorr.m, spCorr.m
% AUTHOR
%   Naotoshi Seo, April 2006
function [F0, T, R] = spPitchTrackCorr(x, fs, frame_length, frame_overlap, maxlag, show)
 %% Initialization
 N = length(x);
 if ~exist('frame_length', 'var') || isempty(frame_length)
     frame_length = 30;
 end
 if ~exist('frame_overlap', 'var') || isempty(frame_overlap)
     frame_overlap = 20;
 end
 if ~exist('maxlag', 'var')
     maxlag = [];
 end
 if ~exist('show', 'var') || isempty(show)
     show = 0;
 end 
 nsample = round(frame_length  * fs / 1000); % convert ms to points
 noverlap = round(frame_overlap * fs / 1000); % convert ms to points

 %% Pitch detection for each frame
 pos = 1; i = 1;
 while (pos+nsample < N)
     frame = x(pos:pos+nsample-1);
     frame = frame - mean(frame); % mean subtraction
     R(:,i) = spCorr(frame, fs);
     F0(i) = spPitchCorr(R(:,i), fs);
     pos = pos + (nsample - noverlap);
     i = i + 1;
 end
 T = (round(nsample/2):(nsample-noverlap):N-1-round(nsample/2))/fs;

if show 
    % plot waveform
    subplot(2,1,1);
    t = (0:N-1)/fs;
    plot(t, x);
    legend('Waveform');
    xlabel('Time (s)');
    ylabel('Amplitude');
    xlim([t(1) t(end)]);

    % plot F0 track
    subplot(2,1,2);
    plot(T,F0);
    legend('pitch track');
    xlabel('Time (s)');
    ylabel('Frequency (Hz)');
    xlim([t(1) t(end)]);
end
function spPitchTrackCorrDemo
[x, fs] = wavread('bee.wav');
[F0, T, R] = spPitchTrackCorr(x, fs, 30, 20, [], 'plot');
end

Formants Detection via LPC Method

LPC

% NAME
%   spLpc - The linear predictive coding (one-step finite observation 
%           wiener filter prediction)
% SYNOPSIS
%   [a P e] = spLpc(x, fs, ncoef, show)
% DESCRIPTION
%   Obtain LPC coefficients (AR model)
% INPUTS
%   x        (vector) of size Nx1 which contains signal
%   fs       (scalar) the sampling frequency
%   [ncoef]  (scalar) the number of coefficients. The default uses
%              ncoef = 2 + fs / 1000;
%             as a rule of thumb. 
% OUTPUTS
%   a        (vector) of size ncoefx1 which contains LPC coefficients
%   P        (scalar) variance (power) of the prediction error
%   e        (vector) of size Nx1 which contains residual error signals
% AUTHOR
%   Naotoshi Seo, April 2008
% USES
%   lpc.m (Signal Processing toolbox)
function [a P e] = spLpc(x, fs, ncoef)
 if ~exist('ncoef', 'var') || isempty(ncoef)
     ncoef = 2 + round(fs / 1000); % rule of thumb for human speech
 end
 [a P] = lpc(x, ncoef);
 if nargout > 2,
    est_x = filter([0 -a(2:end)],1,x);    % Estimated signal
    e = x - est_x;                        % Residual signal
 end 
end
[x, fs] = wavread('bee.wav');
a = spLpc(x, fs, [], 'plot');

Formants Detection

% NAME
%   spFormantsLpc - Formants Estimation via LPC Method
% SYNOPSIS
%   [F] = spFormantsLpc(a, fs)
% DESCRIPTION
%   Estimate formants frequencies
% INPUTS
%   a        (vector) of size ncoefx1 which contains the LPC coefficients
%             of the original signal. Use spLpc.m
%   fs       (scalar) the sampling frequency of the original signal
% OUTPUTS
%   F        (vector) of size ncoefx1 which contains formants
% AUTHOR
%   Naotoshi Seo, April 2008
% SEE ALSO
%   spLpc.m
function [F] = spFormantsLpc(a, fs)
 r = roots(a);
 r = r(imag(r)>0.01);
 F = sort(atan2(imag(r),real(r))*fs/(2*pi));
end

Formants Tracking in Short-Time Frames

% NAME
%   spFormantsTrackLpc: Formants Tracking via the LPC Method 
% SYNOPSIS
%   [F, T] = spFormantsTrackLpc(x, fs, ncoef, 
%               frame_length, frame_overlap, window, show)
% DESCRIPTION
%   Formants Tracking via the LPC method
% INPUTS
%   x      (vector) of size Nx1.
%   fs     (scalar) the sampling rate in Hz. 
%   [ncoef](scalar) the number of LPC coefficients used for
%           estimation. ncoef = 2 + fs / 1000 is the default.
%   [frame_length]
%          (scalar) the length of each frame in micro second. 
%           The default is 30ms.  
%   [frame_overlap] 
%          (scalar) the length of each frame overlaps in micro second. 
%           The default is frame_length / 2. 
%   [window]
%          (string) the window function such as rectwin, hamming.  
%           if not specified, equivalent to hamming
%   [show] (boolean) plot or not. The default is 0.
% OUTPUTS
%   F      (vector) formants (fm = F(find(T == 0.01))
%   T      (vector) formant times
% AUTHOR
%   Naotoshi Seo, April 2006
function [F, T] = spFormantsTrackLpc(x, fs, ncoef, frame_length, frame_overlap, window, show)
 %% Initialization
 N = length(x);
 if ~exist('frame_length', 'var') || isempty(frame_length)
     frame_length = 30;
 end
 if ~exist('frame_overlap', 'var') || isempty(frame_overlap)
     frame_overlap = 20;
 end
 if ~exist('window', 'var') || isempty(window)
     window = 'hamming';
 end
 if ~exist('show', 'var') || isempty(show)
     show = 0;
 end
 if ~exist('ncoef', 'var')
     ncoef = [];
 end
 nsample = round(frame_length  * fs / 1000); % convert ms to points
 noverlap = round(frame_overlap * fs / 1000); % convert ms to points
 window   = eval(sprintf('%s(nsample)', window)); % e.g., hamming(nfft)

 pos = 1; t = 1;
 F = []; % formants
 T = []; % time (s) at the frame
 mid = round(nsample/2);
 while (pos+nsample <= N)
     frame = x(pos:pos+nsample-1);
     frame = frame - mean(frame);
     a = spLpc(frame, fs, ncoef);
     fm = spFormantsLpc(a, fs);
     for i=1:length(fm)
        F = [F fm(i)]; % number of formants are not same for each frame
        T = [T (pos+mid)/fs];
     end
     pos = pos + (nsample - noverlap);
     t = t + 1;
 end

 if show
     % plot waveform
     t=(0:N-1)/fs;
     subplot(2,1,1);
     plot(t,x);
     legend('Waveform');
     xlabel('Time (s)');
     ylabel('Amplitude');
     xlim([t(1) t(end)]);

     % plot formants trace
     subplot(2,1,2);
     plot(T, F, '.');
     hold off;
     legend('Formants');
     xlabel('Time (s)');
     ylabel('Frequency (Hz)');
     xlim([t(1) t(end)]);
 end
[x, fs] = wavread('bee.wav');
[F, T] = spFormantsTrackLpc(x, fs, [], 30, 20, 'hamming', 'plot');

References

  • [1] Xuedong Huang, Alex Acero, Hsiao-wuen hon, "Spoken Language Processing: A guide to Theory, Algorithm, and System Development," Prentice Hall, January 2001