diff options
| author | erdgeist@erdgeist.org <erdgeist@bauklotz.fritz.box> | 2019-07-04 23:26:09 +0200 |
|---|---|---|
| committer | erdgeist@erdgeist.org <erdgeist@bauklotz.fritz.box> | 2019-07-04 23:26:09 +0200 |
| commit | f02dfce6e6c34b3d8a7b8a0e784b506178e331fa (patch) | |
| tree | 45556e6104242d4702689760433d7321ae74ec17 /postfilter.c | |
stripdown of version 0.9
Diffstat (limited to 'postfilter.c')
| -rw-r--r-- | postfilter.c | 142 |
1 files changed, 142 insertions, 0 deletions
diff --git a/postfilter.c b/postfilter.c new file mode 100644 index 0000000..6542c7c --- /dev/null +++ b/postfilter.c | |||
| @@ -0,0 +1,142 @@ | |||
| 1 | /*---------------------------------------------------------------------------*\ | ||
| 2 | |||
| 3 | FILE........: postfilter.c | ||
| 4 | AUTHOR......: David Rowe | ||
| 5 | DATE CREATED: 13/09/09 | ||
| 6 | |||
| 7 | Postfilter to improve sound quality for speech with high levels of | ||
| 8 | background noise. Unlike mixed-excitation models requires no bits | ||
| 9 | to be transmitted to handle background noise. | ||
| 10 | |||
| 11 | \*---------------------------------------------------------------------------*/ | ||
| 12 | |||
| 13 | /* | ||
| 14 | Copyright (C) 2009 David Rowe | ||
| 15 | |||
| 16 | All rights reserved. | ||
| 17 | |||
| 18 | This program is free software; you can redistribute it and/or modify | ||
| 19 | it under the terms of the GNU Lesser General Public License version 2.1, as | ||
| 20 | published by the Free Software Foundation. This program is | ||
| 21 | distributed in the hope that it will be useful, but WITHOUT ANY | ||
| 22 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 23 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | ||
| 24 | License for more details. | ||
| 25 | |||
| 26 | You should have received a copy of the GNU Lesser General Public License | ||
| 27 | along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
| 28 | */ | ||
| 29 | |||
| 30 | #include <assert.h> | ||
| 31 | #include <stdlib.h> | ||
| 32 | #include <stdio.h> | ||
| 33 | #include <math.h> | ||
| 34 | |||
| 35 | #include "defines.h" | ||
| 36 | #include "comp.h" | ||
| 37 | #include "dump.h" | ||
| 38 | #include "sine.h" | ||
| 39 | #include "postfilter.h" | ||
| 40 | |||
| 41 | /*---------------------------------------------------------------------------*\ | ||
| 42 | |||
| 43 | DEFINES | ||
| 44 | |||
| 45 | \*---------------------------------------------------------------------------*/ | ||
| 46 | |||
| 47 | #define BG_THRESH 40.0 /* dB; only unvoiced frames with mean energy below this level update bg_est */ | ||
| 48 | #define BG_BETA 0.1 /* averaging filter constant: weight given to the newest frame energy */ | ||
| 49 | #define BG_MARGIN 6.0 /* dB; harmonics less than this far above the BG noise estimate are | ||
| 50 | randomised. Helped make bg noise less | ||
| 51 | spikey (impulsive) for mmt1, but speech was | ||
| 52 | perhaps a little rougher. | ||
| 53 | */ | ||
| 54 | |||
| 55 | /*---------------------------------------------------------------------------*\ | ||
| 56 | |||
| 57 | postfilter() | ||
| 58 | |||
| 59 | The post filter is designed to help with speech corrupted by | ||
| 60 | background noise. The zero phase model tends to make speech with | ||
| 61 | background noise sound "clicky". With high levels of background | ||
| 62 | noise the low level inter-formant parts of the spectrum will contain | ||
| 63 | noise rather than speech harmonics, so modelling them as voiced | ||
| 64 | (i.e. a continuous, non-random phase track) is inaccurate. | ||
| 65 | |||
| 66 | Some codecs (like MBE) have a mixed voicing model that breaks the | ||
| 67 | spectrum into voiced and unvoiced regions. Several bits/frame | ||
| 68 | (5-12) are required to transmit the frequency selective voicing | ||
| 69 | information. Mixed excitation also requires accurate voicing | ||
| 70 | estimation (parameter estimators always break occasionally under | ||
| 71 | exceptional conditions). | ||
| 72 | |||
| 73 | In our case we use a post filter approach which requires no | ||
| 74 | additional bits to be transmitted. The decoder measures the average | ||
| 75 | level of the background noise during unvoiced frames. If a harmonic | ||
| 76 | is less than this level it is made unvoiced by randomising its | ||
| 77 | phases. | ||
| 78 | |||
| 79 | This idea is rather experimental. Some potential problems that may | ||
| 80 | happen: | ||
| 81 | |||
| 82 | 1/ If someone says "aaaaaaaahhhhhhhhh" will background estimator track | ||
| 83 | up to speech level? This would be a bad thing. | ||
| 84 | |||
| 85 | 2/ If background noise suddenly disappears from the source speech does | ||
| 86 | estimate drop quickly? What if noise suddenly re-appears? | ||
| 87 | |||
| 88 | 3/ Background noise with a non-flat spectrum. Current algorithm just | ||
| 89 | considers spectrum as a whole, but this could be broken up into | ||
| 90 | bands, each with their own estimator. | ||
| 91 | |||
| 92 | 4/ Males and females with the same level of background noise. Check | ||
| 93 | performance the same. Changing Wo affects width of each band, may | ||
| 94 | affect bg energy estimates. | ||
| 95 | |||
| 96 | 5/ Not sure what happens during long periods of unvoiced speech | ||
| 97 | e.g. "sshhhhhhh" | ||
| 98 | |||
| 99 | \*---------------------------------------------------------------------------*/ | ||
| 100 | |||
| 101 | void postfilter( | ||
| 102 | MODEL *model, | ||
| 103 | float *bg_est | ||
| 104 | ) | ||
| 105 | { | ||
| 106 | int m, uv; | ||
| 107 | float e, thresh; | ||
| 108 | |||
| 109 | /* determine average energy across spectrum */ | ||
| 110 | |||
| 111 | e = 1E-12; | ||
| 112 | for(m=1; m<=model->L; m++) | ||
| 113 | e += model->A[m]*model->A[m]; | ||
| 114 | |||
| 115 | assert(e > 0.0); | ||
| 116 | e = 10.0*log10f(e/model->L); | ||
| 117 | |||
| 118 | /* If beneath threhold, update bg estimate. The idea | ||
| 119 | of the threshold is to prevent updating during high level | ||
| 120 | speech. */ | ||
| 121 | |||
| 122 | if ((e < BG_THRESH) && !model->voiced) | ||
| 123 | *bg_est = *bg_est*(1.0 - BG_BETA) + e*BG_BETA; | ||
| 124 | |||
| 125 | /* now mess with phases during voiced frames to make any harmonics | ||
| 126 | less then our background estimate unvoiced. | ||
| 127 | */ | ||
| 128 | |||
| 129 | uv = 0; | ||
| 130 | thresh = POW10F((*bg_est + BG_MARGIN)/20.0); | ||
| 131 | if (model->voiced) | ||
| 132 | for(m=1; m<=model->L; m++) | ||
| 133 | if (model->A[m] < thresh) { | ||
| 134 | model->phi[m] = (TWO_PI/CODEC2_RAND_MAX)*(float)codec2_rand(); | ||
| 135 | uv++; | ||
| 136 | } | ||
| 137 | |||
| 138 | #ifdef DUMP | ||
| 139 | dump_bg(e, *bg_est, 100.0*uv/model->L); | ||
| 140 | #endif | ||
| 141 | |||
| 142 | } | ||
