00001
00002
00003 #include "STTypes.h"
00004 #include "TDStretch.h"
00005 #include "FIRFilter.h"
00006 #include "inttypes.h"
00007
00008 using namespace soundtouch;
00009
00010 double TDStretchSSE3::calcCrossCorrMulti(const float *mPos, const float *cPos) const
00011 {
00012 double corr = 0;
00013 int count = overlapLength * channels;
00014 int loops = count >> 4;
00015 int i = loops << 4;
00016 const float *mp = mPos;
00017 const float *cp = cPos;
00018
00019 __asm__ volatile (
00020 "xorpd %%xmm7, %%xmm7 \n\t"
00021 "1: \n\t"
00022 "movups (%1), %%xmm0 \n\t"
00023 "movups 16(%1), %%xmm1 \n\t"
00024 "mulps (%2), %%xmm0 \n\t"
00025 "movups 32(%1), %%xmm2 \n\t"
00026 "addps %%xmm0, %%xmm7 \n\t"
00027 "mulps 16(%2), %%xmm1 \n\t"
00028 "movups 48(%1), %%xmm3 \n\t"
00029 "mulps 32(%2), %%xmm2 \n\t"
00030 "addps %%xmm1, %%xmm7 \n\t"
00031 "mulps 48(%2), %%xmm3 \n\t"
00032 "addps %%xmm2, %%xmm7 \n\t"
00033 "add $64, %1 \n\t"
00034 "add $64, %2 \n\t"
00035 "addps %%xmm3, %%xmm7 \n\t"
00036 "sub $1, %%ecx \n\t"
00037 "jnz 1b \n\t"
00038 "haddps %%xmm7, %%xmm7 \n\t"
00039 "cvtps2pd %%xmm7, %%xmm7 \n\t"
00040 "haddpd %%xmm7, %%xmm7 \n\t"
00041 "movsd %%xmm7, %0 \n\t"
00042 :"=m"(corr),"+r"(mp), "+r"(cp)
00043 :"c"(loops)
00044 );
00045
00046 for (; i < count; i++)
00047 corr += *mp++ * *cp++;
00048
00049 return corr;
00050 }
00051
00052 double TDStretchSSE2::calcCrossCorrMulti(const float *mPos, const float *cPos) const
00053 {
00054 double corr = 0;
00055 int count = overlapLength * channels;
00056 int loops = count >> 4;
00057 int i = loops << 4;
00058 const float *mp = mPos;
00059 const float *cp = cPos;
00060
00061 __asm__ volatile (
00062 "xorpd %%xmm7, %%xmm7 \n\t"
00063 "1: \n\t"
00064 "movups (%1), %%xmm0 \n\t"
00065 "movups 16(%1), %%xmm1 \n\t"
00066 "mulps (%2), %%xmm0 \n\t"
00067 "movups 32(%1), %%xmm2 \n\t"
00068 "addps %%xmm0, %%xmm7 \n\t"
00069 "mulps 16(%2), %%xmm1 \n\t"
00070 "movups 48(%1), %%xmm3 \n\t"
00071 "mulps 32(%2), %%xmm2 \n\t"
00072 "addps %%xmm1, %%xmm7 \n\t"
00073 "mulps 48(%2), %%xmm3 \n\t"
00074 "addps %%xmm2, %%xmm7 \n\t"
00075 "add $64, %1 \n\t"
00076 "add $64, %2 \n\t"
00077 "addps %%xmm3, %%xmm7 \n\t"
00078 "sub $1, %%ecx \n\t"
00079 "jnz 1b \n\t"
00080 "movaps %%xmm7, %%xmm6 \n\t"
00081 "shufps $0x4e, %%xmm7, %%xmm6 \n\t"
00082 "addps %%xmm6, %%xmm7 \n\t"
00083 "cvtps2pd %%xmm7, %%xmm7 \n\t"
00084 "movapd %%xmm7, %%xmm6 \n\t"
00085 "shufpd $0x01, %%xmm7, %%xmm6 \n\t"
00086 "addpd %%xmm6, %%xmm7 \n\t"
00087 "movsd %%xmm7, %0 \n\t"
00088 :"=m"(corr),"+r"(mp), "+r"(cp)
00089 :"c"(loops)
00090 );
00091
00092 for (; i < count; i++)
00093 corr += *mp++ * *cp++;
00094
00095 return corr;
00096 }
00097
00098 double TDStretchSSE3::calcCrossCorrStereo(const float *mPos, const float *cPos) const
00099 {
00100 double corr = 0;
00101 int count = overlapLength <<1;
00102 int loops = count >> 4;
00103 int i = loops << 4;
00104 const float *mp = mPos;
00105 const float *cp = cPos;
00106
00107 __asm__ volatile (
00108 "xorpd %%xmm7, %%xmm7 \n\t"
00109 "1: \n\t"
00110 "movups (%1), %%xmm0 \n\t"
00111 "movups 16(%1), %%xmm1 \n\t"
00112 "mulps (%2), %%xmm0 \n\t"
00113 "movups 32(%1), %%xmm2 \n\t"
00114 "addps %%xmm0, %%xmm7 \n\t"
00115 "mulps 16(%2), %%xmm1 \n\t"
00116 "movups 48(%1), %%xmm3 \n\t"
00117 "mulps 32(%2), %%xmm2 \n\t"
00118 "addps %%xmm1, %%xmm7 \n\t"
00119 "mulps 48(%2), %%xmm3 \n\t"
00120 "addps %%xmm2, %%xmm7 \n\t"
00121 "add $64, %1 \n\t"
00122 "add $64, %2 \n\t"
00123 "addps %%xmm3, %%xmm7 \n\t"
00124 "sub $1, %%ecx \n\t"
00125 "jnz 1b \n\t"
00126 "haddps %%xmm7, %%xmm7 \n\t"
00127 "cvtps2pd %%xmm7, %%xmm7 \n\t"
00128 "haddpd %%xmm7, %%xmm7 \n\t"
00129 "movsd %%xmm7, %0 \n\t"
00130 :"=m"(corr),"+r"(mp), "+r"(cp)
00131 :"c"(loops)
00132 );
00133
00134 for (; i < count; i += 2)
00135 corr += (mp[i] * cp[i] + mp[i + 1] * cp[i + 1]);
00136
00137 return corr;
00138 }
00139
00140 double TDStretchSSE2::calcCrossCorrStereo(const float *mPos, const float *cPos) const
00141 {
00142 double corr = 0;
00143 int count = overlapLength <<1;
00144 int loops = count >> 4;
00145 int i = loops << 4;
00146 const float *mp = mPos;
00147 const float *cp = cPos;
00148
00149 __asm__ volatile (
00150 "xorpd %%xmm7, %%xmm7 \n\t"
00151 "1: \n\t"
00152 "movups (%1), %%xmm0 \n\t"
00153 "movups 16(%1), %%xmm1 \n\t"
00154 "mulps (%2), %%xmm0 \n\t"
00155 "movups 32(%1), %%xmm2 \n\t"
00156 "addps %%xmm0, %%xmm7 \n\t"
00157 "mulps 16(%2), %%xmm1 \n\t"
00158 "movups 48(%1), %%xmm3 \n\t"
00159 "mulps 32(%2), %%xmm2 \n\t"
00160 "addps %%xmm1, %%xmm7 \n\t"
00161 "mulps 48(%2), %%xmm3 \n\t"
00162 "addps %%xmm2, %%xmm7 \n\t"
00163 "add $64, %1 \n\t"
00164 "add $64, %2 \n\t"
00165 "addps %%xmm3, %%xmm7 \n\t"
00166 "sub $1, %%ecx \n\t"
00167 "jnz 1b \n\t"
00168 "movaps %%xmm7, %%xmm6 \n\t"
00169 "shufps $0x4e, %%xmm7, %%xmm6 \n\t"
00170 "addps %%xmm6, %%xmm7 \n\t"
00171 "cvtps2pd %%xmm7, %%xmm7 \n\t"
00172 "movapd %%xmm7, %%xmm6 \n\t"
00173 "shufpd $0x01, %%xmm7, %%xmm6 \n\t"
00174 "addpd %%xmm6, %%xmm7 \n\t"
00175 "movsd %%xmm7, %0 \n\t"
00176 :"=m"(corr),"+r"(mp), "+r"(cp)
00177 :"c"(loops)
00178 );
00179
00180 for (; i < count; i += 2)
00181 corr += (mp[i] * cp[i] + mp[i + 1] * cp[i + 1]);
00182
00183 return corr;
00184 }
00185
00186 void TDStretchSSE2::overlapMulti(float *output, const float *input) const
00187 {
00188
00189 float *o = output;
00190 const float *i = input;
00191 const float *m = pMidBuffer;
00192
00193 if (channels > 4)
00194 __asm__ volatile (
00195 "cvtsi2ss %%ecx, %%xmm7 \n\t"
00196 "shl $2, %4 \n\t"
00197 "punpckldq %%xmm7, %%xmm7 \n\t"
00198 "xorpd %%xmm6, %%xmm6 \n\t"
00199 "punpckldq %%xmm7, %%xmm7 \n\t"
00200 "rcpps %%xmm7, %%xmm1 \n\t"
00201 "mulps %%xmm1, %%xmm7 \n\t"
00202 "1: \n\t"
00203 "movups (%1), %%xmm2 \n\t"
00204 "movups 16(%1), %%xmm4 \n\t"
00205 "mulps %%xmm6, %%xmm2 \n\t"
00206 "movups (%2), %%xmm3 \n\t"
00207 "movups 16(%2), %%xmm5 \n\t"
00208 "mulps %%xmm7, %%xmm3 \n\t"
00209 "add %4, %1 \n\t"
00210 "mulps %%xmm6, %%xmm4 \n\t"
00211 "addps %%xmm2, %%xmm3 \n\t"
00212 "mulps %%xmm7, %%xmm5 \n\t"
00213 "movups %%xmm3, (%3) \n\t"
00214 "addps %%xmm4, %%xmm5 \n\t"
00215 "add %4, %2 \n\t"
00216 "movups %%xmm5, 16(%3) \n\t"
00217 "addps %%xmm1, %%xmm6 \n\t"
00218 "add %4, %3 \n\t"
00219 "subps %%xmm1, %%xmm7 \n\t"
00220 "sub $1, %%ecx \n\t"
00221 "jnz 1b \n\t"
00222 :
00223 :"c"(overlapLength),"r"(i),"r"(m),"r"(o),"r"((long)channels)
00224 );
00225 else
00226 __asm__ volatile (
00227 "cvtsi2ss %%ecx, %%xmm7 \n\t"
00228 "shl $2, %4 \n\t"
00229 "shr %%ecx \n\t"
00230 "punpckldq %%xmm7, %%xmm7 \n\t"
00231 "xorpd %%xmm6, %%xmm6 \n\t"
00232 "punpckldq %%xmm7, %%xmm7 \n\t"
00233 "rcpps %%xmm7, %%xmm1 \n\t"
00234 "mulps %%xmm1, %%xmm7 \n\t"
00235 "1: \n\t"
00236 "movups (%1), %%xmm2 \n\t"
00237 "movups 16(%1), %%xmm4 \n\t"
00238 "mulps %%xmm6, %%xmm2 \n\t"
00239 "movups (%2), %%xmm3 \n\t"
00240 "movups 16(%2), %%xmm5 \n\t"
00241 "mulps %%xmm7, %%xmm3 \n\t"
00242 "addps %%xmm1, %%xmm6 \n\t"
00243 "add %4, %1 \n\t"
00244 "addps %%xmm2, %%xmm3 \n\t"
00245 "add %4, %2 \n\t"
00246 "subps %%xmm1, %%xmm7 \n\t"
00247 "movups %%xmm3, (%3) \n\t"
00248 "add %4, %3 \n\t"
00249 "mulps %%xmm6, %%xmm4 \n\t"
00250 "add %4, %1 \n\t"
00251 "mulps %%xmm7, %%xmm5 \n\t"
00252 "addps %%xmm1, %%xmm6 \n\t"
00253 "add %4, %2 \n\t"
00254 "addps %%xmm4, %%xmm5 \n\t"
00255 "subps %%xmm1, %%xmm7 \n\t"
00256 "movups %%xmm5, (%3) \n\t"
00257 "add %4, %3 \n\t"
00258 "sub $1, %%ecx \n\t"
00259 "jnz 1b \n\t"
00260 :
00261 :"c"(overlapLength),"r"(i),"r"(m),"r"(o),"r"((long)channels)
00262 );
00263 }
00264
00265 void TDStretchSSE2::overlapStereo(float *output, const float *input) const
00266 {
00267 float *o = output;
00268 const float *i = input;
00269 const float *m = pMidBuffer;
00270
00271 __asm__ volatile (
00272 "cvtsi2ss %%ecx, %%xmm7 \n\t"
00273 "shr %%ecx \n\t"
00274 "xorpd %%xmm6, %%xmm6 \n\t"
00275 "punpckldq %%xmm7, %%xmm7 \n\t"
00276 "rcpps %%xmm7, %%xmm1 \n\t"
00277 "mulps %%xmm1, %%xmm7 \n\t"
00278 "1: \n\t"
00279 "movups (%1), %%xmm2 \n\t"
00280 "movups 8(%1), %%xmm4 \n\t"
00281 "mulps %%xmm6, %%xmm2 \n\t"
00282 "movups (%2), %%xmm3 \n\t"
00283 "movups 8(%2), %%xmm5 \n\t"
00284 "mulps %%xmm7, %%xmm3 \n\t"
00285 "addps %%xmm1, %%xmm6 \n\t"
00286 "addps %%xmm2, %%xmm3 \n\t"
00287 "subps %%xmm1, %%xmm7 \n\t"
00288 "movlps %%xmm3, (%3) \n\t"
00289 "add $8, %3 \n\t"
00290 "mulps %%xmm6, %%xmm4 \n\t"
00291 "add $16, %1 \n\t"
00292 "mulps %%xmm7, %%xmm5 \n\t"
00293 "addps %%xmm1, %%xmm6 \n\t"
00294 "add $16, %2 \n\t"
00295 "addps %%xmm4, %%xmm5 \n\t"
00296 "subps %%xmm1, %%xmm7 \n\t"
00297 "movlps %%xmm5, (%3) \n\t"
00298 "add $8, %3 \n\t"
00299 "sub $1, %%ecx \n\t"
00300 "jnz 1b \n\t"
00301 :
00302 :"c"(overlapLength),"r"(i),"r"(m),"r"(o)
00303 );
00304 }
00305
00306 FIRFilterSSE2::FIRFilterSSE2() : FIRFilter()
00307 {
00308 filterCoeffsAlign = NULL;
00309 filterCoeffsUnalign = NULL;
00310 }
00311
00312 FIRFilterSSE2::~FIRFilterSSE2()
00313 {
00314 delete[] filterCoeffsUnalign;
00315 filterCoeffsAlign = NULL;
00316 filterCoeffsUnalign = NULL;
00317 }
00318
00319
00320 void FIRFilterSSE2::setCoefficients(const float *coeffs, uint newLen, uint uRDF)
00321 {
00322 uint i;
00323 FIRFilter::setCoefficients(coeffs, newLen, uRDF);
00324
00325
00326 delete[] filterCoeffsUnalign;
00327 filterCoeffsUnalign = new float[2 * newLen + 16];
00328 filterCoeffsAlign = (float *)(((ulong)filterCoeffsUnalign + 15) & -16);
00329
00330 float fdiv = (float)resultDivider;
00331
00332 for (i = 0; i < newLen; i++)
00333 {
00334 filterCoeffsAlign[2 * i + 0] =
00335 filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fdiv;
00336 }
00337 }
00338
00339 uint FIRFilterSSE2::evaluateFilterStereo(float *dest, const float *src, const uint numSamples) const
00340 {
00341 uint count = (numSamples - length) & -2;
00342
00343 for (int i = 0; i < count; i += 2)
00344 {
00345 __asm__ volatile(
00346 "xorpd %%xmm6, %%xmm6 \n\t"
00347 "xorpd %%xmm7, %%xmm7 \n\t"
00348 "1: \n\t"
00349 "movups (%1), %%xmm1 \n\t"
00350 "movups 8(%1), %%xmm2 \n\t"
00351 "mulps (%2), %%xmm1 \n\t"
00352 "movups 16(%1), %%xmm3 \n\t"
00353 "mulps (%2), %%xmm2 \n\t"
00354 "addps %%xmm1, %%xmm6 \n\t"
00355 "movups 24(%1), %%xmm4 \n\t"
00356 "addps %%xmm2, %%xmm7 \n\t"
00357 "mulps 16(%2), %%xmm3 \n\t"
00358 "movups 32(%1), %%xmm1 \n\t"
00359 "mulps 16(%2), %%xmm4 \n\t"
00360 "addps %%xmm3, %%xmm6 \n\t"
00361 "movups 40(%1), %%xmm2 \n\t"
00362 "addps %%xmm4, %%xmm7 \n\t"
00363 "mulps 32(%2), %%xmm1 \n\t"
00364 "movups 48(%1), %%xmm3 \n\t"
00365 "mulps 32(%2), %%xmm2 \n\t"
00366 "addps %%xmm1, %%xmm6 \n\t"
00367 "movups 56(%1), %%xmm4 \n\t"
00368 "addps %%xmm2, %%xmm7 \n\t"
00369 "mulps 48(%2), %%xmm3 \n\t"
00370 "add $64, %1 \n\t"
00371 "mulps 48(%2), %%xmm4 \n\t"
00372 "addps %%xmm3, %%xmm6 \n\t"
00373 "add $64, %2 \n\t"
00374 "addps %%xmm4, %%xmm7 \n\t"
00375 "sub $1, %%ecx \n\t"
00376 "jnz 1b \n\t"
00377 "movhlps %%xmm6, %%xmm0 \n\t"
00378 "movlhps %%xmm7, %%xmm0 \n\t"
00379 "shufps $0xe4, %%xmm7, %%xmm6 \n\t"
00380 "addps %%xmm0, %%xmm6 \n\t"
00381 "movups %%xmm6, (%0) \n\t"
00382 :
00383 :"r"(dest),"r"(src),"r"(filterCoeffsAlign),"c"(length>>3)
00384 );
00385 src += 4;
00386 dest += 4;
00387 }
00388
00389 return count;
00390 }