diff options
Diffstat (limited to 'nlp.c')
| -rw-r--r-- | nlp.c | 702 |
1 files changed, 702 insertions, 0 deletions
| @@ -0,0 +1,702 @@ | |||
| 1 | /*---------------------------------------------------------------------------*\ | ||
| 2 | |||
| 3 | FILE........: nlp.c | ||
| 4 | AUTHOR......: David Rowe | ||
| 5 | DATE CREATED: 23/3/93 | ||
| 6 | |||
| 7 | Non Linear Pitch (NLP) estimation functions. | ||
| 8 | |||
| 9 | \*---------------------------------------------------------------------------*/ | ||
| 10 | |||
| 11 | /* | ||
| 12 | Copyright (C) 2009 David Rowe | ||
| 13 | |||
| 14 | All rights reserved. | ||
| 15 | |||
| 16 | This program is free software; you can redistribute it and/or modify | ||
| 17 | it under the terms of the GNU Lesser General Public License version 2.1, as | ||
| 18 | published by the Free Software Foundation. This program is | ||
| 19 | distributed in the hope that it will be useful, but WITHOUT ANY | ||
| 20 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 21 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | ||
| 22 | License for more details. | ||
| 23 | |||
| 24 | You should have received a copy of the GNU Lesser General Public License | ||
| 25 | along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
| 26 | */ | ||
| 27 | |||
| 28 | #include "defines.h" | ||
| 29 | #include "nlp.h" | ||
| 30 | #include "dump.h" | ||
| 31 | #include "codec2_fft.h" | ||
| 32 | #undef PROFILE | ||
| 33 | #include "machdep.h" | ||
| 34 | #include "os.h" | ||
| 35 | |||
| 36 | #include <assert.h> | ||
| 37 | #include <math.h> | ||
| 38 | #include <stdlib.h> | ||
| 39 | |||
| 40 | /*---------------------------------------------------------------------------*\ | ||
| 41 | |||
| 42 | DEFINES | ||
| 43 | |||
| 44 | \*---------------------------------------------------------------------------*/ | ||
| 45 | |||
| 46 | #define PMAX_M 320 /* maximum NLP analysis window size */ | ||
| 47 | #define COEFF 0.95 /* notch filter parameter */ | ||
| 48 | #define PE_FFT_SIZE 512 /* DFT size for pitch estimation */ | ||
| 49 | #define DEC 5 /* decimation factor */ | ||
| 50 | #define SAMPLE_RATE 8000 | ||
| 51 | #define PI 3.141592654 /* mathematical constant */ | ||
| 52 | #define T 0.1 /* threshold for local minima candidate */ | ||
| 53 | #define F0_MAX 500 | ||
| 54 | #define CNLP 0.3 /* post processor constant */ | ||
| 55 | #define NLP_NTAP 48 /* Decimation LPF order */ | ||
| 56 | #undef POST_PROCESS_MBE /* choose post processor */ | ||
| 57 | |||
| 58 | /* 8 to 16 kHz sample rate conversion */ | ||
| 59 | |||
| 60 | #define FDMDV_OS 2 /* oversampling rate */ | ||
| 61 | #define FDMDV_OS_TAPS_16K 48 /* number of OS filter taps at 16kHz */ | ||
| 62 | #define FDMDV_OS_TAPS_8K (FDMDV_OS_TAPS_16K/FDMDV_OS) /* number of OS filter taps at 8kHz */ | ||
| 63 | |||
| 64 | /*---------------------------------------------------------------------------*\ | ||
| 65 | |||
| 66 | GLOBALS | ||
| 67 | |||
| 68 | \*---------------------------------------------------------------------------*/ | ||
| 69 | |||
| 70 | /* 48 tap 600Hz low pass FIR filter coefficients */ | ||
| 71 | |||
| 72 | const float nlp_fir[] = { | ||
| 73 | -1.0818124e-03, | ||
| 74 | -1.1008344e-03, | ||
| 75 | -9.2768838e-04, | ||
| 76 | -4.2289438e-04, | ||
| 77 | 5.5034190e-04, | ||
| 78 | 2.0029849e-03, | ||
| 79 | 3.7058509e-03, | ||
| 80 | 5.1449415e-03, | ||
| 81 | 5.5924666e-03, | ||
| 82 | 4.3036754e-03, | ||
| 83 | 8.0284511e-04, | ||
| 84 | -4.8204610e-03, | ||
| 85 | -1.1705810e-02, | ||
| 86 | -1.8199275e-02, | ||
| 87 | -2.2065282e-02, | ||
| 88 | -2.0920610e-02, | ||
| 89 | -1.2808831e-02, | ||
| 90 | 3.2204775e-03, | ||
| 91 | 2.6683811e-02, | ||
| 92 | 5.5520624e-02, | ||
| 93 | 8.6305944e-02, | ||
| 94 | 1.1480192e-01, | ||
| 95 | 1.3674206e-01, | ||
| 96 | 1.4867556e-01, | ||
| 97 | 1.4867556e-01, | ||
| 98 | 1.3674206e-01, | ||
| 99 | 1.1480192e-01, | ||
| 100 | 8.6305944e-02, | ||
| 101 | 5.5520624e-02, | ||
| 102 | 2.6683811e-02, | ||
| 103 | 3.2204775e-03, | ||
| 104 | -1.2808831e-02, | ||
| 105 | -2.0920610e-02, | ||
| 106 | -2.2065282e-02, | ||
| 107 | -1.8199275e-02, | ||
| 108 | -1.1705810e-02, | ||
| 109 | -4.8204610e-03, | ||
| 110 | 8.0284511e-04, | ||
| 111 | 4.3036754e-03, | ||
| 112 | 5.5924666e-03, | ||
| 113 | 5.1449415e-03, | ||
| 114 | 3.7058509e-03, | ||
| 115 | 2.0029849e-03, | ||
| 116 | 5.5034190e-04, | ||
| 117 | -4.2289438e-04, | ||
| 118 | -9.2768838e-04, | ||
| 119 | -1.1008344e-03, | ||
| 120 | -1.0818124e-03 | ||
| 121 | }; | ||
| 122 | |||
| 123 | typedef struct { | ||
| 124 | int Fs; /* sample rate in Hz */ | ||
| 125 | int m; | ||
| 126 | float w[PMAX_M/DEC]; /* DFT window */ | ||
| 127 | float sq[PMAX_M]; /* squared speech samples */ | ||
| 128 | float mem_x,mem_y; /* memory for notch filter */ | ||
| 129 | float mem_fir[NLP_NTAP]; /* decimation FIR filter memory */ | ||
| 130 | codec2_fft_cfg fft_cfg; /* kiss FFT config */ | ||
| 131 | float *Sn16k; /* Fs=16kHz input speech vector */ | ||
| 132 | FILE *f; | ||
| 133 | } NLP; | ||
| 134 | |||
| 135 | #ifdef POST_PROCESS_MBE | ||
| 136 | float test_candidate_mbe(COMP Sw[], COMP W[], float f0); | ||
| 137 | float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo); | ||
| 138 | #endif | ||
| 139 | float post_process_sub_multiples(COMP Fw[], | ||
| 140 | int pmin, int pmax, float gmax, int gmax_bin, | ||
| 141 | float *prev_f0); | ||
| 142 | static void fdmdv_16_to_8(float out8k[], float in16k[], int n); | ||
| 143 | |||
| 144 | /*---------------------------------------------------------------------------*\ | ||
| 145 | |||
| 146 | nlp_create() | ||
| 147 | |||
| 148 | Initialisation function for NLP pitch estimator. | ||
| 149 | |||
| 150 | \*---------------------------------------------------------------------------*/ | ||
| 151 | |||
| 152 | void *nlp_create(C2CONST *c2const) | ||
| 153 | { | ||
| 154 | NLP *nlp; | ||
| 155 | int i; | ||
| 156 | int m = c2const->m_pitch; | ||
| 157 | int Fs = c2const->Fs; | ||
| 158 | |||
| 159 | nlp = (NLP*)malloc(sizeof(NLP)); | ||
| 160 | if (nlp == NULL) | ||
| 161 | return NULL; | ||
| 162 | |||
| 163 | assert((Fs == 8000) || (Fs == 16000)); | ||
| 164 | nlp->Fs = Fs; | ||
| 165 | |||
| 166 | nlp->m = m; | ||
| 167 | |||
| 168 | /* if running at 16kHz allocate storage for decimating filter memory */ | ||
| 169 | |||
| 170 | if (Fs == 16000) { | ||
| 171 | nlp->Sn16k = (float*)malloc(sizeof(float)*(FDMDV_OS_TAPS_16K + c2const->n_samp)); | ||
| 172 | for(i=0; i<FDMDV_OS_TAPS_16K; i++) { | ||
| 173 | nlp->Sn16k[i] = 0.0; | ||
| 174 | } | ||
| 175 | if (nlp->Sn16k == NULL) { | ||
| 176 | free(nlp); | ||
| 177 | return NULL; | ||
| 178 | } | ||
| 179 | |||
| 180 | /* most processing occurs at 8 kHz sample rate so halve m */ | ||
| 181 | |||
| 182 | m /= 2; | ||
| 183 | } | ||
| 184 | |||
| 185 | assert(m <= PMAX_M); | ||
| 186 | |||
| 187 | for(i=0; i<m/DEC; i++) { | ||
| 188 | nlp->w[i] = 0.5 - 0.5*cosf(2*PI*i/(m/DEC-1)); | ||
| 189 | } | ||
| 190 | |||
| 191 | for(i=0; i<PMAX_M; i++) | ||
| 192 | nlp->sq[i] = 0.0; | ||
| 193 | nlp->mem_x = 0.0; | ||
| 194 | nlp->mem_y = 0.0; | ||
| 195 | for(i=0; i<NLP_NTAP; i++) | ||
| 196 | nlp->mem_fir[i] = 0.0; | ||
| 197 | |||
| 198 | nlp->fft_cfg = codec2_fft_alloc (PE_FFT_SIZE, 0, NULL, NULL); | ||
| 199 | assert(nlp->fft_cfg != NULL); | ||
| 200 | |||
| 201 | return (void*)nlp; | ||
| 202 | } | ||
| 203 | |||
| 204 | /*---------------------------------------------------------------------------*\ | ||
| 205 | |||
| 206 | nlp_destroy() | ||
| 207 | |||
| 208 | Shut down function for NLP pitch estimator. | ||
| 209 | |||
| 210 | \*---------------------------------------------------------------------------*/ | ||
| 211 | |||
| 212 | void nlp_destroy(void *nlp_state) | ||
| 213 | { | ||
| 214 | NLP *nlp; | ||
| 215 | assert(nlp_state != NULL); | ||
| 216 | nlp = (NLP*)nlp_state; | ||
| 217 | |||
| 218 | codec2_fft_free(nlp->fft_cfg); | ||
| 219 | if (nlp->Fs == 16000) { | ||
| 220 | free(nlp->Sn16k); | ||
| 221 | } | ||
| 222 | free(nlp_state); | ||
| 223 | } | ||
| 224 | |||
| 225 | /*---------------------------------------------------------------------------*\ | ||
| 226 | |||
| 227 | nlp() | ||
| 228 | |||
| 229 | Determines the pitch in samples using the Non Linear Pitch (NLP) | ||
| 230 | algorithm [1]. Returns the fundamental in Hz. Note that the actual | ||
| 231 | pitch estimate is for the centre of the M sample Sn[] vector, not | ||
| 232 | the current N sample input vector. This is (I think) a delay of 2.5 | ||
| 233 | frames with N=80 samples. You should align further analysis using | ||
| 234 | this pitch estimate to be centred on the middle of Sn[]. | ||
| 235 | |||
| 236 | Two post processors have been tried, the MBE version (as discussed | ||
| 237 | in [1]), and a post processor that checks sub-multiples. Both | ||
| 238 | suffer occasional gross pitch errors (i.e. neither are perfect). In | ||
| 239 | the presence of background noise the sub-multiple algorithm tends | ||
| 240 | towards low F0 which leads to better sounding background noise than | ||
| 241 | the MBE post processor. | ||
| 242 | |||
| 243 | A good way to test and develop the NLP pitch estimator is using the | ||
| 244 | tnlp (codec2/unittest) and the codec2/octave/plnlp.m Octave script. | ||
| 245 | |||
| 246 | A pitch tracker searching a few frames forward and backward in time | ||
| 247 | would be a useful addition. | ||
| 248 | |||
| 249 | References: | ||
| 250 | |||
| 251 | [1] http://rowetel.com/downloads/1997_rowe_phd_thesis.pdf Chapter 4 | ||
| 252 | |||
| 253 | \*---------------------------------------------------------------------------*/ | ||
| 254 | |||
| 255 | float nlp( | ||
| 256 | void *nlp_state, | ||
| 257 | float Sn[], /* input speech vector */ | ||
| 258 | int n, /* frames shift (no. new samples in Sn[]) */ | ||
| 259 | float *pitch, /* estimated pitch period in samples at current Fs */ | ||
| 260 | COMP Sw[], /* Freq domain version of Sn[] */ | ||
| 261 | COMP W[], /* Freq domain window */ | ||
| 262 | float *prev_f0 /* previous pitch f0 in Hz, memory for pitch tracking */ | ||
| 263 | ) | ||
| 264 | { | ||
| 265 | NLP *nlp; | ||
| 266 | float notch; /* current notch filter output */ | ||
| 267 | COMP Fw[PE_FFT_SIZE]; /* DFT of squared signal (input/output) */ | ||
| 268 | float gmax; | ||
| 269 | int gmax_bin; | ||
| 270 | int m, i, j; | ||
| 271 | float best_f0; | ||
| 272 | PROFILE_VAR(start, tnotch, filter, peakpick, window, fft, magsq, shiftmem); | ||
| 273 | |||
| 274 | assert(nlp_state != NULL); | ||
| 275 | nlp = (NLP*)nlp_state; | ||
| 276 | m = nlp->m; | ||
| 277 | |||
| 278 | /* Square, notch filter at DC, and LP filter vector */ | ||
| 279 | |||
| 280 | /* If running at 16 kHz decimate to 8 kHz, as NLP ws designed for | ||
| 281 | Fs = 8kHz. The decimating filter introduces about 3ms of delay, | ||
| 282 | that shouldn't be a problem as pitch changes slowly. */ | ||
| 283 | |||
| 284 | if (nlp->Fs == 8000) { | ||
| 285 | /* Square latest input samples */ | ||
| 286 | |||
| 287 | for(i=m-n; i<m; i++) { | ||
| 288 | nlp->sq[i] = Sn[i]*Sn[i]; | ||
| 289 | } | ||
| 290 | } | ||
| 291 | else { | ||
| 292 | assert(nlp->Fs == 16000); | ||
| 293 | |||
| 294 | /* re-sample at 8 KHz */ | ||
| 295 | |||
| 296 | for(i=0; i<n; i++) { | ||
| 297 | nlp->Sn16k[FDMDV_OS_TAPS_16K+i] = Sn[m-n+i]; | ||
| 298 | } | ||
| 299 | |||
| 300 | m /= 2; n /= 2; | ||
| 301 | |||
| 302 | float Sn8k[n]; | ||
| 303 | fdmdv_16_to_8(Sn8k, &nlp->Sn16k[FDMDV_OS_TAPS_16K], n); | ||
| 304 | |||
| 305 | /* Square latest input samples */ | ||
| 306 | |||
| 307 | for(i=m-n, j=0; i<m; i++, j++) { | ||
| 308 | nlp->sq[i] = Sn8k[j]*Sn8k[j]; | ||
| 309 | } | ||
| 310 | assert(j <= n); | ||
| 311 | } | ||
| 312 | //fprintf(stderr, "n: %d m: %d\n", n, m); | ||
| 313 | |||
| 314 | PROFILE_SAMPLE(start); | ||
| 315 | |||
| 316 | for(i=m-n; i<m; i++) { /* notch filter at DC */ | ||
| 317 | notch = nlp->sq[i] - nlp->mem_x; | ||
| 318 | notch += COEFF*nlp->mem_y; | ||
| 319 | nlp->mem_x = nlp->sq[i]; | ||
| 320 | nlp->mem_y = notch; | ||
| 321 | nlp->sq[i] = notch + 1.0; /* With 0 input vectors to codec, | ||
| 322 | kiss_fft() would take a long | ||
| 323 | time to execute when running in | ||
| 324 | real time. Problem was traced | ||
| 325 | to kiss_fft function call in | ||
| 326 | this function. Adding this small | ||
| 327 | constant fixed problem. Not | ||
| 328 | exactly sure why. */ | ||
| 329 | } | ||
| 330 | |||
| 331 | PROFILE_SAMPLE_AND_LOG(tnotch, start, " square and notch"); | ||
| 332 | |||
| 333 | for(i=m-n; i<m; i++) { /* FIR filter vector */ | ||
| 334 | |||
| 335 | for(j=0; j<NLP_NTAP-1; j++) | ||
| 336 | nlp->mem_fir[j] = nlp->mem_fir[j+1]; | ||
| 337 | nlp->mem_fir[NLP_NTAP-1] = nlp->sq[i]; | ||
| 338 | |||
| 339 | nlp->sq[i] = 0.0; | ||
| 340 | for(j=0; j<NLP_NTAP; j++) | ||
| 341 | nlp->sq[i] += nlp->mem_fir[j]*nlp_fir[j]; | ||
| 342 | } | ||
| 343 | |||
| 344 | PROFILE_SAMPLE_AND_LOG(filter, tnotch, " filter"); | ||
| 345 | |||
| 346 | /* Decimate and DFT */ | ||
| 347 | |||
| 348 | for(i=0; i<PE_FFT_SIZE; i++) { | ||
| 349 | Fw[i].real = 0.0; | ||
| 350 | Fw[i].imag = 0.0; | ||
| 351 | } | ||
| 352 | for(i=0; i<m/DEC; i++) { | ||
| 353 | Fw[i].real = nlp->sq[i*DEC]*nlp->w[i]; | ||
| 354 | } | ||
| 355 | PROFILE_SAMPLE_AND_LOG(window, filter, " window"); | ||
| 356 | #ifdef DUMP | ||
| 357 | dump_dec(Fw); | ||
| 358 | #endif | ||
| 359 | |||
| 360 | // FIXME: check if this can be converted to a real fft | ||
| 361 | // since all imag inputs are 0 | ||
| 362 | codec2_fft_inplace(nlp->fft_cfg, Fw); | ||
| 363 | PROFILE_SAMPLE_AND_LOG(fft, window, " fft"); | ||
| 364 | |||
| 365 | for(i=0; i<PE_FFT_SIZE; i++) | ||
| 366 | Fw[i].real = Fw[i].real*Fw[i].real + Fw[i].imag*Fw[i].imag; | ||
| 367 | |||
| 368 | PROFILE_SAMPLE_AND_LOG(magsq, fft, " mag sq"); | ||
| 369 | #ifdef DUMP | ||
| 370 | dump_sq(m, nlp->sq); | ||
| 371 | dump_Fw(Fw); | ||
| 372 | #endif | ||
| 373 | |||
| 374 | /* todo: express everything in f0, as pitch in samples is dep on Fs */ | ||
| 375 | |||
| 376 | int pmin = floor(SAMPLE_RATE*P_MIN_S); | ||
| 377 | int pmax = floor(SAMPLE_RATE*P_MAX_S); | ||
| 378 | |||
| 379 | /* find global peak */ | ||
| 380 | |||
| 381 | gmax = 0.0; | ||
| 382 | gmax_bin = PE_FFT_SIZE*DEC/pmax; | ||
| 383 | for(i=PE_FFT_SIZE*DEC/pmax; i<=PE_FFT_SIZE*DEC/pmin; i++) { | ||
| 384 | if (Fw[i].real > gmax) { | ||
| 385 | gmax = Fw[i].real; | ||
| 386 | gmax_bin = i; | ||
| 387 | } | ||
| 388 | } | ||
| 389 | |||
| 390 | PROFILE_SAMPLE_AND_LOG(peakpick, magsq, " peak pick"); | ||
| 391 | |||
| 392 | #ifdef POST_PROCESS_MBE | ||
| 393 | best_f0 = post_process_mbe(Fw, pmin, pmax, gmax, Sw, W, prev_f0); | ||
| 394 | #else | ||
| 395 | best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, prev_f0); | ||
| 396 | #endif | ||
| 397 | |||
| 398 | PROFILE_SAMPLE_AND_LOG(shiftmem, peakpick, " post process"); | ||
| 399 | |||
| 400 | /* Shift samples in buffer to make room for new samples */ | ||
| 401 | |||
| 402 | for(i=0; i<m-n; i++) | ||
| 403 | nlp->sq[i] = nlp->sq[i+n]; | ||
| 404 | |||
| 405 | /* return pitch period in samples and F0 estimate */ | ||
| 406 | |||
| 407 | *pitch = (float)nlp->Fs/best_f0; | ||
| 408 | |||
| 409 | PROFILE_SAMPLE_AND_LOG2(shiftmem, " shift mem"); | ||
| 410 | |||
| 411 | PROFILE_SAMPLE_AND_LOG2(start, " nlp int"); | ||
| 412 | |||
| 413 | *prev_f0 = best_f0; | ||
| 414 | |||
| 415 | return(best_f0); | ||
| 416 | } | ||
| 417 | |||
| 418 | /*---------------------------------------------------------------------------*\ | ||
| 419 | |||
| 420 | post_process_sub_multiples() | ||
| 421 | |||
| 422 | Given the global maximma of Fw[] we search integer submultiples for | ||
| 423 | local maxima. If local maxima exist and they are above an | ||
| 424 | experimentally derived threshold (OK a magic number I pulled out of | ||
| 425 | the air) we choose the submultiple as the F0 estimate. | ||
| 426 | |||
| 427 | The rational for this is that the lowest frequency peak of Fw[] | ||
| 428 | should be F0, as Fw[] can be considered the autocorrelation function | ||
| 429 | of Sw[] (the speech spectrum). However sometimes due to phase | ||
| 430 | effects the lowest frequency maxima may not be the global maxima. | ||
| 431 | |||
| 432 | This works OK in practice and favours low F0 values in the presence | ||
| 433 | of background noise which means the sinusoidal codec does an OK job | ||
| 434 | of synthesising the background noise. High F0 in background noise | ||
| 435 | tends to sound more periodic introducing annoying artifacts. | ||
| 436 | |||
| 437 | \*---------------------------------------------------------------------------*/ | ||
| 438 | |||
| 439 | float post_process_sub_multiples(COMP Fw[], | ||
| 440 | int pmin, int pmax, float gmax, int gmax_bin, | ||
| 441 | float *prev_f0) | ||
| 442 | { | ||
| 443 | int min_bin, cmax_bin; | ||
| 444 | int mult; | ||
| 445 | float thresh, best_f0; | ||
| 446 | int b, bmin, bmax, lmax_bin; | ||
| 447 | float lmax; | ||
| 448 | int prev_f0_bin; | ||
| 449 | |||
| 450 | /* post process estimate by searching submultiples */ | ||
| 451 | |||
| 452 | mult = 2; | ||
| 453 | min_bin = PE_FFT_SIZE*DEC/pmax; | ||
| 454 | cmax_bin = gmax_bin; | ||
| 455 | prev_f0_bin = *prev_f0*(PE_FFT_SIZE*DEC)/SAMPLE_RATE; | ||
| 456 | |||
| 457 | while(gmax_bin/mult >= min_bin) { | ||
| 458 | |||
| 459 | b = gmax_bin/mult; /* determine search interval */ | ||
| 460 | bmin = 0.8*b; | ||
| 461 | bmax = 1.2*b; | ||
| 462 | if (bmin < min_bin) | ||
| 463 | bmin = min_bin; | ||
| 464 | |||
| 465 | /* lower threshold to favour previous frames pitch estimate, | ||
| 466 | this is a form of pitch tracking */ | ||
| 467 | |||
| 468 | if ((prev_f0_bin > bmin) && (prev_f0_bin < bmax)) | ||
| 469 | thresh = CNLP*0.5*gmax; | ||
| 470 | else | ||
| 471 | thresh = CNLP*gmax; | ||
| 472 | |||
| 473 | lmax = 0; | ||
| 474 | lmax_bin = bmin; | ||
| 475 | for (b=bmin; b<=bmax; b++) /* look for maximum in interval */ | ||
| 476 | if (Fw[b].real > lmax) { | ||
| 477 | lmax = Fw[b].real; | ||
| 478 | lmax_bin = b; | ||
| 479 | } | ||
| 480 | |||
| 481 | if (lmax > thresh) | ||
| 482 | if ((lmax > Fw[lmax_bin-1].real) && (lmax > Fw[lmax_bin+1].real)) { | ||
| 483 | cmax_bin = lmax_bin; | ||
| 484 | } | ||
| 485 | |||
| 486 | mult++; | ||
| 487 | } | ||
| 488 | |||
| 489 | best_f0 = (float)cmax_bin*SAMPLE_RATE/(PE_FFT_SIZE*DEC); | ||
| 490 | |||
| 491 | return best_f0; | ||
| 492 | } | ||
| 493 | |||
| 494 | #ifdef POST_PROCESS_MBE | ||
| 495 | |||
| 496 | /*---------------------------------------------------------------------------*\ | ||
| 497 | |||
| 498 | post_process_mbe() | ||
| 499 | |||
| 500 | Use the MBE pitch estimation algorithm to evaluate pitch candidates. This | ||
| 501 | works OK but the accuracy at low F0 is affected by NW, the analysis window | ||
| 502 | size used for the DFT of the input speech Sw[]. Also favours high F0 in | ||
| 503 | the presence of background noise which causes periodic artifacts in the | ||
| 504 | synthesised speech. | ||
| 505 | |||
| 506 | \*---------------------------------------------------------------------------*/ | ||
| 507 | |||
| 508 | float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo) | ||
| 509 | { | ||
| 510 | float candidate_f0; | ||
| 511 | float f0,best_f0; /* fundamental frequency */ | ||
| 512 | float e,e_min; /* MBE cost function */ | ||
| 513 | int i; | ||
| 514 | #ifdef DUMP | ||
| 515 | float e_hz[F0_MAX]; | ||
| 516 | #endif | ||
| 517 | #if !defined(NDEBUG) || defined(DUMP) | ||
| 518 | int bin; | ||
| 519 | #endif | ||
| 520 | float f0_min, f0_max; | ||
| 521 | float f0_start, f0_end; | ||
| 522 | |||
| 523 | f0_min = (float)SAMPLE_RATE/pmax; | ||
| 524 | f0_max = (float)SAMPLE_RATE/pmin; | ||
| 525 | |||
| 526 | /* Now look for local maxima. Each local maxima is a candidate | ||
| 527 | that we test using the MBE pitch estimation algotithm */ | ||
| 528 | |||
| 529 | #ifdef DUMP | ||
| 530 | for(i=0; i<F0_MAX; i++) | ||
| 531 | e_hz[i] = -1; | ||
| 532 | #endif | ||
| 533 | e_min = 1E32; | ||
| 534 | best_f0 = 50; | ||
| 535 | for(i=PE_FFT_SIZE*DEC/pmax; i<=PE_FFT_SIZE*DEC/pmin; i++) { | ||
| 536 | if ((Fw[i].real > Fw[i-1].real) && (Fw[i].real > Fw[i+1].real)) { | ||
| 537 | |||
| 538 | /* local maxima found, lets test if it's big enough */ | ||
| 539 | |||
| 540 | if (Fw[i].real > T*gmax) { | ||
| 541 | |||
| 542 | /* OK, sample MBE cost function over +/- 10Hz range in 2.5Hz steps */ | ||
| 543 | |||
| 544 | candidate_f0 = (float)i*SAMPLE_RATE/(PE_FFT_SIZE*DEC); | ||
| 545 | f0_start = candidate_f0-20; | ||
| 546 | f0_end = candidate_f0+20; | ||
| 547 | if (f0_start < f0_min) f0_start = f0_min; | ||
| 548 | if (f0_end > f0_max) f0_end = f0_max; | ||
| 549 | |||
| 550 | for(f0=f0_start; f0<=f0_end; f0+= 2.5) { | ||
| 551 | e = test_candidate_mbe(Sw, W, f0); | ||
| 552 | #if !defined(NDEBUG) || defined(DUMP) | ||
| 553 | bin = floorf(f0); assert((bin > 0) && (bin < F0_MAX)); | ||
| 554 | #endif | ||
| 555 | #ifdef DUMP | ||
| 556 | e_hz[bin] = e; | ||
| 557 | #endif | ||
| 558 | if (e < e_min) { | ||
| 559 | e_min = e; | ||
| 560 | best_f0 = f0; | ||
| 561 | } | ||
| 562 | } | ||
| 563 | |||
| 564 | } | ||
| 565 | } | ||
| 566 | } | ||
| 567 | |||
| 568 | /* finally sample MBE cost function around previous pitch estimate | ||
| 569 | (form of pitch tracking) */ | ||
| 570 | |||
| 571 | candidate_f0 = *prev_Wo * SAMPLE_RATE/TWO_PI; | ||
| 572 | f0_start = candidate_f0-20; | ||
| 573 | f0_end = candidate_f0+20; | ||
| 574 | if (f0_start < f0_min) f0_start = f0_min; | ||
| 575 | if (f0_end > f0_max) f0_end = f0_max; | ||
| 576 | |||
| 577 | for(f0=f0_start; f0<=f0_end; f0+= 2.5) { | ||
| 578 | e = test_candidate_mbe(Sw, W, f0); | ||
| 579 | #if !defined(NDEBUG) || defined(DUMP) | ||
| 580 | bin = floorf(f0); assert((bin > 0) && (bin < F0_MAX)); | ||
| 581 | #endif | ||
| 582 | #ifdef DUMP | ||
| 583 | e_hz[bin] = e; | ||
| 584 | #endif | ||
| 585 | if (e < e_min) { | ||
| 586 | e_min = e; | ||
| 587 | best_f0 = f0; | ||
| 588 | } | ||
| 589 | } | ||
| 590 | |||
| 591 | #ifdef DUMP | ||
| 592 | dump_e(e_hz); | ||
| 593 | #endif | ||
| 594 | |||
| 595 | return best_f0; | ||
| 596 | } | ||
| 597 | |||
| 598 | /*---------------------------------------------------------------------------*\ | ||
| 599 | |||
| 600 | test_candidate_mbe() | ||
| 601 | |||
| 602 | Returns the error of the MBE cost function for the input f0. | ||
| 603 | |||
| 604 | Note: I think a lot of the operations below can be simplified as | ||
| 605 | W[].imag = 0 and has been normalised such that den always equals 1. | ||
| 606 | |||
| 607 | \*---------------------------------------------------------------------------*/ | ||
| 608 | |||
| 609 | float test_candidate_mbe( | ||
| 610 | COMP Sw[], | ||
| 611 | COMP W[], | ||
| 612 | float f0 | ||
| 613 | ) | ||
| 614 | { | ||
| 615 | COMP Sw_[FFT_ENC]; /* DFT of all voiced synthesised signal */ | ||
| 616 | int l,al,bl,m; /* loop variables */ | ||
| 617 | COMP Am; /* amplitude sample for this band */ | ||
| 618 | int offset; /* centers Hw[] about current harmonic */ | ||
| 619 | float den; /* denominator of Am expression */ | ||
| 620 | float error; /* accumulated error between originl and synthesised */ | ||
| 621 | float Wo; /* current "test" fundamental freq. */ | ||
| 622 | int L; | ||
| 623 | |||
| 624 | L = floorf((SAMPLE_RATE/2.0)/f0); | ||
| 625 | Wo = f0*(2*PI/SAMPLE_RATE); | ||
| 626 | |||
| 627 | error = 0.0; | ||
| 628 | |||
| 629 | /* Just test across the harmonics in the first 1000 Hz (L/4) */ | ||
| 630 | |||
| 631 | for(l=1; l<L/4; l++) { | ||
| 632 | Am.real = 0.0; | ||
| 633 | Am.imag = 0.0; | ||
| 634 | den = 0.0; | ||
| 635 | al = ceilf((l - 0.5)*Wo*FFT_ENC/TWO_PI); | ||
| 636 | bl = ceilf((l + 0.5)*Wo*FFT_ENC/TWO_PI); | ||
| 637 | |||
| 638 | /* Estimate amplitude of harmonic assuming harmonic is totally voiced */ | ||
| 639 | |||
| 640 | for(m=al; m<bl; m++) { | ||
| 641 | offset = FFT_ENC/2 + m - l*Wo*FFT_ENC/TWO_PI + 0.5; | ||
| 642 | Am.real += Sw[m].real*W[offset].real + Sw[m].imag*W[offset].imag; | ||
| 643 | Am.imag += Sw[m].imag*W[offset].real - Sw[m].real*W[offset].imag; | ||
| 644 | den += W[offset].real*W[offset].real + W[offset].imag*W[offset].imag; | ||
| 645 | } | ||
| 646 | |||
| 647 | Am.real = Am.real/den; | ||
| 648 | Am.imag = Am.imag/den; | ||
| 649 | |||
| 650 | /* Determine error between estimated harmonic and original */ | ||
| 651 | |||
| 652 | for(m=al; m<bl; m++) { | ||
| 653 | offset = FFT_ENC/2 + m - l*Wo*FFT_ENC/TWO_PI + 0.5; | ||
| 654 | Sw_[m].real = Am.real*W[offset].real - Am.imag*W[offset].imag; | ||
| 655 | Sw_[m].imag = Am.real*W[offset].imag + Am.imag*W[offset].real; | ||
| 656 | error += (Sw[m].real - Sw_[m].real)*(Sw[m].real - Sw_[m].real); | ||
| 657 | error += (Sw[m].imag - Sw_[m].imag)*(Sw[m].imag - Sw_[m].imag); | ||
| 658 | } | ||
| 659 | } | ||
| 660 | |||
| 661 | return error; | ||
| 662 | } | ||
| 663 | |||
| 664 | #endif | ||
| 665 | |||
| 666 | /*---------------------------------------------------------------------------*\ | ||
| 667 | |||
| 668 | FUNCTION....: fdmdv_16_to_8() | ||
| 669 | AUTHOR......: David Rowe | ||
| 670 | DATE CREATED: 9 May 2012 | ||
| 671 | |||
| 672 | Changes the sample rate of a signal from 16 to 8 kHz. | ||
| 673 | |||
| 674 | n is the number of samples at the 8 kHz rate, there are FDMDV_OS*n | ||
| 675 | samples at the 48 kHz rate. As above however a memory of | ||
| 676 | FDMDV_OS_TAPS samples is reqd for in16k[] (see t16_8.c unit test as example). | ||
| 677 | |||
| 678 | Low pass filter the 16 kHz signal at 4 kHz using the same filter as | ||
| 679 | the upsampler, then just output every FDMDV_OS-th filtered sample. | ||
| 680 | |||
| 681 | Note: this function copied from fdmdv.c, included in nlp.c as a convenience | ||
| 682 | to avoid linking with another source file. | ||
| 683 | |||
| 684 | \*---------------------------------------------------------------------------*/ | ||
| 685 | |||
| 686 | static void fdmdv_16_to_8(float out8k[], float in16k[], int n) | ||
| 687 | { | ||
| 688 | float acc; | ||
| 689 | int i,j,k; | ||
| 690 | |||
| 691 | for(i=0, k=0; k<n; i+=FDMDV_OS, k++) { | ||
| 692 | acc = 0.0; | ||
| 693 | for(j=0; j<FDMDV_OS_TAPS_16K; j++) | ||
| 694 | acc += fdmdv_os_filter[j]*in16k[i-j]; | ||
| 695 | out8k[k] = acc; | ||
| 696 | } | ||
| 697 | |||
| 698 | /* update filter memory */ | ||
| 699 | |||
| 700 | for(i=-FDMDV_OS_TAPS_16K; i<0; i++) | ||
| 701 | in16k[i] = in16k[i + n*FDMDV_OS]; | ||
| 702 | } | ||
