diff options
Diffstat (limited to 'phase.c')
-rw-r--r-- | phase.c | 289 |
1 files changed, 289 insertions, 0 deletions
@@ -0,0 +1,289 @@ | |||
1 | /*---------------------------------------------------------------------------*\ | ||
2 | |||
3 | FILE........: phase.c | ||
4 | AUTHOR......: David Rowe | ||
5 | DATE CREATED: 1/2/09 | ||
6 | |||
7 | Functions for modelling and synthesising phase. | ||
8 | |||
9 | \*---------------------------------------------------------------------------*/ | ||
10 | |||
11 | /* | ||
12 | Copyright (C) 2009 David Rowe | ||
13 | |||
14 | All rights reserved. | ||
15 | |||
16 | This program is free software; you can redistribute it and/or modify | ||
17 | it under the terms of the GNU Lesser General Public License version 2.1, as | ||
18 | published by the Free Software Foundation. This program is | ||
19 | distributed in the hope that it will be useful, but WITHOUT ANY | ||
20 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
21 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | ||
22 | License for more details. | ||
23 | |||
24 | You should have received a copy of the GNU Lesser General Public License | ||
25 | along with this program; if not,see <http://www.gnu.org/licenses/>. | ||
26 | */ | ||
27 | |||
28 | #include "defines.h" | ||
29 | #include "phase.h" | ||
30 | #include "kiss_fft.h" | ||
31 | #include "comp.h" | ||
32 | #include "comp_prim.h" | ||
33 | #include "sine.h" | ||
34 | |||
35 | #include <assert.h> | ||
36 | #include <ctype.h> | ||
37 | #include <math.h> | ||
38 | #include <string.h> | ||
39 | #include <stdlib.h> | ||
40 | |||
41 | /*---------------------------------------------------------------------------*\ | ||
42 | |||
43 | sample_phase() | ||
44 | |||
45 | Samples phase at centre of each harmonic from and array of FFT_ENC | ||
46 | DFT samples. | ||
47 | |||
48 | \*---------------------------------------------------------------------------*/ | ||
49 | |||
50 | void sample_phase(MODEL *model, | ||
51 | COMP H[], | ||
52 | COMP A[] /* LPC analysis filter in freq domain */ | ||
53 | ) | ||
54 | { | ||
55 | int m, b; | ||
56 | float r; | ||
57 | |||
58 | r = TWO_PI/(FFT_ENC); | ||
59 | |||
60 | /* Sample phase at harmonics */ | ||
61 | |||
62 | for(m=1; m<=model->L; m++) { | ||
63 | b = (int)(m*model->Wo/r + 0.5); | ||
64 | H[m] = cconj(A[b]); /* synth filter 1/A is opposite phase to analysis filter */ | ||
65 | } | ||
66 | } | ||
67 | |||
68 | |||
69 | /*---------------------------------------------------------------------------*\ | ||
70 | |||
71 | phase_synth_zero_order() | ||
72 | |||
73 | Synthesises phases based on SNR and a rule based approach. No phase | ||
74 | parameters are required apart from the SNR (which can be reduced to a | ||
75 | 1 bit V/UV decision per frame). | ||
76 | |||
77 | The phase of each harmonic is modelled as the phase of a synthesis | ||
78 | filter excited by an impulse. In many Codec 2 modes the synthesis | ||
79 | filter is a LPC filter. Unlike the first order model the position | ||
80 | of the impulse is not transmitted, so we create an excitation pulse | ||
81 | train using a rule based approach. | ||
82 | |||
83 | Consider a pulse train with a pulse starting time n=0, with pulses | ||
84 | repeated at a rate of Wo, the fundamental frequency. A pulse train | ||
85 | in the time domain is equivalent to harmonics in the frequency | ||
86 | domain. We can make an excitation pulse train using a sum of | ||
87 | sinsusoids: | ||
88 | |||
89 | for(m=1; m<=L; m++) | ||
90 | ex[n] = cos(m*Wo*n) | ||
91 | |||
92 | Note: the Octave script ../octave/phase.m is an example of this if | ||
93 | you would like to try making a pulse train. | ||
94 | |||
95 | The phase of each excitation harmonic is: | ||
96 | |||
97 | arg(E[m]) = mWo | ||
98 | |||
99 | where E[m] are the complex excitation (freq domain) samples, | ||
100 | arg(x), just returns the phase of a complex sample x. | ||
101 | |||
102 | As we don't transmit the pulse position for this model, we need to | ||
103 | synthesise it. Now the excitation pulses occur at a rate of Wo. | ||
104 | This means the phase of the first harmonic advances by N_SAMP samples | ||
105 | over a synthesis frame of N_SAMP samples. For example if Wo is pi/20 | ||
106 | (200 Hz), then over a 10ms frame (N_SAMP=80 samples), the phase of the | ||
107 | first harmonic would advance (pi/20)*80 = 4*pi or two complete | ||
108 | cycles. | ||
109 | |||
110 | We generate the excitation phase of the fundamental (first | ||
111 | harmonic): | ||
112 | |||
113 | arg[E[1]] = Wo*N_SAMP; | ||
114 | |||
115 | We then relate the phase of the m-th excitation harmonic to the | ||
116 | phase of the fundamental as: | ||
117 | |||
118 | arg(E[m]) = m*arg(E[1]) | ||
119 | |||
120 | This E[m] then gets passed through the LPC synthesis filter to | ||
121 | determine the final harmonic phase. | ||
122 | |||
123 | Comparing to speech synthesised using original phases: | ||
124 | |||
125 | - Through headphones speech synthesised with this model is not as | ||
126 | good. Through a loudspeaker it is very close to original phases. | ||
127 | |||
128 | - If there are voicing errors, the speech can sound clicky or | ||
129 | staticy. If V speech is mistakenly declared UV, this model tends to | ||
130 | synthesise impulses or clicks, as there is usually very little shift or | ||
131 | dispersion through the LPC synthesis filter. | ||
132 | |||
133 | - When combined with LPC amplitude modelling there is an additional | ||
134 | drop in quality. I am not sure why, theory is interformant energy | ||
135 | is raised making any phase errors more obvious. | ||
136 | |||
137 | NOTES: | ||
138 | |||
139 | 1/ This synthesis model is effectively the same as a simple LPC-10 | ||
140 | vocoders, and yet sounds much better. Why? Conventional wisdom | ||
141 | (AMBE, MELP) says mixed voicing is required for high quality | ||
142 | speech. | ||
143 | |||
144 | 2/ I am pretty sure the Lincoln Lab sinusoidal coding guys (like xMBE | ||
145 | also from MIT) first described this zero phase model, I need to look | ||
146 | up the paper. | ||
147 | |||
148 | 3/ Note that this approach could cause some discontinuities in | ||
149 | the phase at the edge of synthesis frames, as no attempt is made | ||
150 | to make sure that the phase tracks are continuous (the excitation | ||
151 | phases are continuous, but not the final phases after filtering | ||
152 | by the LPC spectra). Technically this is a bad thing. However | ||
153 | this may actually be a good thing, disturbing the phase tracks a | ||
154 | bit. More research needed, e.g. test a synthesis model that adds | ||
155 | a small delta-W to make phase tracks line up for voiced | ||
156 | harmonics. | ||
157 | |||
158 | \*---------------------------------------------------------------------------*/ | ||
159 | |||
160 | void phase_synth_zero_order( | ||
161 | int n_samp, | ||
162 | MODEL *model, | ||
163 | float *ex_phase, /* excitation phase of fundamental */ | ||
164 | COMP H[] /* L synthesis filter freq domain samples */ | ||
165 | |||
166 | ) | ||
167 | { | ||
168 | int m; | ||
169 | float new_phi; | ||
170 | COMP Ex[MAX_AMP+1]; /* excitation samples */ | ||
171 | COMP A_[MAX_AMP+1]; /* synthesised harmonic samples */ | ||
172 | |||
173 | /* | ||
174 | Update excitation fundamental phase track, this sets the position | ||
175 | of each pitch pulse during voiced speech. After much experiment | ||
176 | I found that using just this frame's Wo improved quality for UV | ||
177 | sounds compared to interpolating two frames Wo like this: | ||
178 | |||
179 | ex_phase[0] += (*prev_Wo+model->Wo)*N_SAMP/2; | ||
180 | */ | ||
181 | |||
182 | ex_phase[0] += (model->Wo)*n_samp; | ||
183 | ex_phase[0] -= TWO_PI*floorf(ex_phase[0]/TWO_PI + 0.5); | ||
184 | |||
185 | for(m=1; m<=model->L; m++) { | ||
186 | |||
187 | /* generate excitation */ | ||
188 | |||
189 | if (model->voiced) { | ||
190 | |||
191 | Ex[m].real = cosf(ex_phase[0]*m); | ||
192 | Ex[m].imag = sinf(ex_phase[0]*m); | ||
193 | } | ||
194 | else { | ||
195 | |||
196 | /* When a few samples were tested I found that LPC filter | ||
197 | phase is not needed in the unvoiced case, but no harm in | ||
198 | keeping it. | ||
199 | */ | ||
200 | float phi = TWO_PI*(float)codec2_rand()/CODEC2_RAND_MAX; | ||
201 | Ex[m].real = cosf(phi); | ||
202 | Ex[m].imag = sinf(phi); | ||
203 | } | ||
204 | |||
205 | /* filter using LPC filter */ | ||
206 | |||
207 | A_[m].real = H[m].real*Ex[m].real - H[m].imag*Ex[m].imag; | ||
208 | A_[m].imag = H[m].imag*Ex[m].real + H[m].real*Ex[m].imag; | ||
209 | |||
210 | /* modify sinusoidal phase */ | ||
211 | |||
212 | new_phi = atan2f(A_[m].imag, A_[m].real+1E-12); | ||
213 | model->phi[m] = new_phi; | ||
214 | } | ||
215 | |||
216 | } | ||
217 | |||
218 | |||
219 | /*---------------------------------------------------------------------------*\ | ||
220 | |||
221 | FUNCTION....: mag_to_phase | ||
222 | AUTHOR......: David Rowe | ||
223 | DATE CREATED: Jan 2017 | ||
224 | |||
225 | Algorithm for http://www.dsprelated.com/showcode/20.php ported to C. See | ||
226 | also Octave function mag_to_phase.m | ||
227 | |||
228 | Given a magnitude spectrum in dB, returns a minimum-phase phase | ||
229 | spectra. | ||
230 | |||
231 | \*---------------------------------------------------------------------------*/ | ||
232 | |||
233 | void mag_to_phase(float phase[], /* Nfft/2+1 output phase samples in radians */ | ||
234 | float Gdbfk[], /* Nfft/2+1 postive freq amplitudes samples in dB */ | ||
235 | int Nfft, | ||
236 | codec2_fft_cfg fft_fwd_cfg, | ||
237 | codec2_fft_cfg fft_inv_cfg | ||
238 | ) | ||
239 | { | ||
240 | COMP Sdb[Nfft], c[Nfft], cf[Nfft], Cf[Nfft]; | ||
241 | int Ns = Nfft/2+1; | ||
242 | int i; | ||
243 | |||
244 | /* install negative frequency components, 1/Nfft takes into | ||
245 | account kiss fft lack of scaling on ifft */ | ||
246 | |||
247 | Sdb[0].real = Gdbfk[0]; | ||
248 | Sdb[0].imag = 0.0; | ||
249 | for(i=1; i<Ns; i++) { | ||
250 | Sdb[i].real = Sdb[Nfft-i].real = Gdbfk[i]; | ||
251 | Sdb[i].imag = Sdb[Nfft-i].imag = 0.0; | ||
252 | } | ||
253 | |||
254 | /* compute real cepstrum from log magnitude spectrum */ | ||
255 | |||
256 | codec2_fft(fft_inv_cfg, Sdb, c); | ||
257 | for(i=0; i<Nfft; i++) { | ||
258 | c[i].real /= (float)Nfft; | ||
259 | c[i].imag /= (float)Nfft; | ||
260 | } | ||
261 | |||
262 | /* Fold cepstrum to reflect non-min-phase zeros inside unit circle */ | ||
263 | |||
264 | cf[0] = c[0]; | ||
265 | for(i=1; i<Ns-1; i++) { | ||
266 | cf[i] = cadd(c[i],c[Nfft-i]); | ||
267 | } | ||
268 | cf[Ns-1] = c[Ns-1]; | ||
269 | for(i=Ns; i<Nfft; i++) { | ||
270 | cf[i].real = 0.0; | ||
271 | cf[i].imag = 0.0; | ||
272 | } | ||
273 | |||
274 | /* Cf = dB_magnitude + j * minimum_phase */ | ||
275 | |||
276 | codec2_fft(fft_fwd_cfg, cf, Cf); | ||
277 | |||
278 | /* The maths says we are meant to be using log(x), not 20*log10(x), | ||
279 | so we need to scale the phase to account for this: | ||
280 | log(x) = 20*log10(x)/scale */ | ||
281 | |||
282 | float scale = (20.0/logf(10.0)); | ||
283 | |||
284 | for(i=0; i<Ns; i++) { | ||
285 | phase[i] = Cf[i].imag/scale; | ||
286 | } | ||
287 | |||
288 | |||
289 | } | ||