00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
/*
 * ASSERT_ALIGNED(ptr): debugging aid that checks that a pointer is 16-byte
 * aligned, i.e. that the four low bits of its address are all zero.
 *
 * The check is only compiled in when DEBUG_ALIGNMENT is defined; otherwise
 * the macro expands to an empty statement. Both expansions carry their own
 * trailing ';' so that existing call sites keep compiling unchanged.
 */
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
00027
00028
00029 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
00030 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
00031 DECLARE_ALIGNED_16(signed int, ABCD[4]) =
00032 {((8 - x) * (8 - y)),
00033 ((x) * (8 - y)),
00034 ((8 - x) * (y)),
00035 ((x) * (y))};
00036 register int i;
00037 vec_u8_t fperm;
00038 const vec_s32_t vABCD = vec_ld(0, ABCD);
00039 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
00040 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
00041 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
00042 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
00043 LOAD_ZERO;
00044 const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
00045 const vec_u16_t v6us = vec_splat_u16(6);
00046 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
00047 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
00048
00049 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
00050 vec_u8_t vsrc0uc, vsrc1uc;
00051 vec_s16_t vsrc0ssH, vsrc1ssH;
00052 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
00053 vec_s16_t vsrc2ssH, vsrc3ssH, psum;
00054 vec_u8_t vdst, ppsum, vfdst, fsum;
00055
00056 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
00057
00058 if (((unsigned long)dst) % 16 == 0) {
00059 fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
00060 0x14, 0x15, 0x16, 0x17,
00061 0x08, 0x09, 0x0A, 0x0B,
00062 0x0C, 0x0D, 0x0E, 0x0F);
00063 } else {
00064 fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
00065 0x04, 0x05, 0x06, 0x07,
00066 0x18, 0x19, 0x1A, 0x1B,
00067 0x1C, 0x1D, 0x1E, 0x1F);
00068 }
00069
00070 vsrcAuc = vec_ld(0, src);
00071
00072 if (loadSecond)
00073 vsrcBuc = vec_ld(16, src);
00074 vsrcperm0 = vec_lvsl(0, src);
00075 vsrcperm1 = vec_lvsl(1, src);
00076
00077 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
00078 if (reallyBadAlign)
00079 vsrc1uc = vsrcBuc;
00080 else
00081 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
00082
00083 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
00084 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);
00085
00086 if (!loadSecond) {
00087 for (i = 0 ; i < h ; i++) {
00088
00089
00090 vsrcCuc = vec_ld(stride + 0, src);
00091
00092 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
00093 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
00094
00095 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
00096 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);
00097
00098 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
00099 psum = vec_mladd(vB, vsrc1ssH, psum);
00100 psum = vec_mladd(vC, vsrc2ssH, psum);
00101 psum = vec_mladd(vD, vsrc3ssH, psum);
00102 psum = vec_add(v32ss, psum);
00103 psum = vec_sra(psum, v6us);
00104
00105 vdst = vec_ld(0, dst);
00106 ppsum = (vec_u8_t)vec_packsu(psum, psum);
00107 vfdst = vec_perm(vdst, ppsum, fperm);
00108
00109 OP_U8_ALTIVEC(fsum, vfdst, vdst);
00110
00111 vec_st(fsum, 0, dst);
00112
00113 vsrc0ssH = vsrc2ssH;
00114 vsrc1ssH = vsrc3ssH;
00115
00116 dst += stride;
00117 src += stride;
00118 }
00119 } else {
00120 vec_u8_t vsrcDuc;
00121 for (i = 0 ; i < h ; i++) {
00122 vsrcCuc = vec_ld(stride + 0, src);
00123 vsrcDuc = vec_ld(stride + 16, src);
00124
00125 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
00126 if (reallyBadAlign)
00127 vsrc3uc = vsrcDuc;
00128 else
00129 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
00130
00131 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
00132 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);
00133
00134 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
00135 psum = vec_mladd(vB, vsrc1ssH, psum);
00136 psum = vec_mladd(vC, vsrc2ssH, psum);
00137 psum = vec_mladd(vD, vsrc3ssH, psum);
00138 psum = vec_add(v32ss, psum);
00139 psum = vec_sr(psum, v6us);
00140
00141 vdst = vec_ld(0, dst);
00142 ppsum = (vec_u8_t)vec_pack(psum, psum);
00143 vfdst = vec_perm(vdst, ppsum, fperm);
00144
00145 OP_U8_ALTIVEC(fsum, vfdst, vdst);
00146
00147 vec_st(fsum, 0, dst);
00148
00149 vsrc0ssH = vsrc2ssH;
00150 vsrc1ssH = vsrc3ssH;
00151
00152 dst += stride;
00153 src += stride;
00154 }
00155 }
00156 POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
00157 }
00158
00159
00160 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
00161 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
00162 register int i;
00163
00164 LOAD_ZERO;
00165 const vec_u8_t permM2 = vec_lvsl(-2, src);
00166 const vec_u8_t permM1 = vec_lvsl(-1, src);
00167 const vec_u8_t permP0 = vec_lvsl(+0, src);
00168 const vec_u8_t permP1 = vec_lvsl(+1, src);
00169 const vec_u8_t permP2 = vec_lvsl(+2, src);
00170 const vec_u8_t permP3 = vec_lvsl(+3, src);
00171 const vec_s16_t v5ss = vec_splat_s16(5);
00172 const vec_u16_t v5us = vec_splat_u16(5);
00173 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
00174 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
00175
00176 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
00177
00178 register int align = ((((unsigned long)src) - 2) % 16);
00179
00180 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
00181 srcP2A, srcP2B, srcP3A, srcP3B,
00182 srcM1A, srcM1B, srcM2A, srcM2B,
00183 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
00184 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
00185 psumA, psumB, sumA, sumB;
00186
00187 vec_u8_t sum, vdst, fsum;
00188
00189 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
00190
00191 for (i = 0 ; i < 16 ; i ++) {
00192 vec_u8_t srcR1 = vec_ld(-2, src);
00193 vec_u8_t srcR2 = vec_ld(14, src);
00194
00195 switch (align) {
00196 default: {
00197 srcM2 = vec_perm(srcR1, srcR2, permM2);
00198 srcM1 = vec_perm(srcR1, srcR2, permM1);
00199 srcP0 = vec_perm(srcR1, srcR2, permP0);
00200 srcP1 = vec_perm(srcR1, srcR2, permP1);
00201 srcP2 = vec_perm(srcR1, srcR2, permP2);
00202 srcP3 = vec_perm(srcR1, srcR2, permP3);
00203 } break;
00204 case 11: {
00205 srcM2 = vec_perm(srcR1, srcR2, permM2);
00206 srcM1 = vec_perm(srcR1, srcR2, permM1);
00207 srcP0 = vec_perm(srcR1, srcR2, permP0);
00208 srcP1 = vec_perm(srcR1, srcR2, permP1);
00209 srcP2 = vec_perm(srcR1, srcR2, permP2);
00210 srcP3 = srcR2;
00211 } break;
00212 case 12: {
00213 vec_u8_t srcR3 = vec_ld(30, src);
00214 srcM2 = vec_perm(srcR1, srcR2, permM2);
00215 srcM1 = vec_perm(srcR1, srcR2, permM1);
00216 srcP0 = vec_perm(srcR1, srcR2, permP0);
00217 srcP1 = vec_perm(srcR1, srcR2, permP1);
00218 srcP2 = srcR2;
00219 srcP3 = vec_perm(srcR2, srcR3, permP3);
00220 } break;
00221 case 13: {
00222 vec_u8_t srcR3 = vec_ld(30, src);
00223 srcM2 = vec_perm(srcR1, srcR2, permM2);
00224 srcM1 = vec_perm(srcR1, srcR2, permM1);
00225 srcP0 = vec_perm(srcR1, srcR2, permP0);
00226 srcP1 = srcR2;
00227 srcP2 = vec_perm(srcR2, srcR3, permP2);
00228 srcP3 = vec_perm(srcR2, srcR3, permP3);
00229 } break;
00230 case 14: {
00231 vec_u8_t srcR3 = vec_ld(30, src);
00232 srcM2 = vec_perm(srcR1, srcR2, permM2);
00233 srcM1 = vec_perm(srcR1, srcR2, permM1);
00234 srcP0 = srcR2;
00235 srcP1 = vec_perm(srcR2, srcR3, permP1);
00236 srcP2 = vec_perm(srcR2, srcR3, permP2);
00237 srcP3 = vec_perm(srcR2, srcR3, permP3);
00238 } break;
00239 case 15: {
00240 vec_u8_t srcR3 = vec_ld(30, src);
00241 srcM2 = vec_perm(srcR1, srcR2, permM2);
00242 srcM1 = srcR2;
00243 srcP0 = vec_perm(srcR2, srcR3, permP0);
00244 srcP1 = vec_perm(srcR2, srcR3, permP1);
00245 srcP2 = vec_perm(srcR2, srcR3, permP2);
00246 srcP3 = vec_perm(srcR2, srcR3, permP3);
00247 } break;
00248 }
00249
00250 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
00251 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
00252 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
00253 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
00254
00255 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
00256 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
00257 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
00258 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
00259
00260 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
00261 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
00262 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
00263 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
00264
00265 sum1A = vec_adds(srcP0A, srcP1A);
00266 sum1B = vec_adds(srcP0B, srcP1B);
00267 sum2A = vec_adds(srcM1A, srcP2A);
00268 sum2B = vec_adds(srcM1B, srcP2B);
00269 sum3A = vec_adds(srcM2A, srcP3A);
00270 sum3B = vec_adds(srcM2B, srcP3B);
00271
00272 pp1A = vec_mladd(sum1A, v20ss, v16ss);
00273 pp1B = vec_mladd(sum1B, v20ss, v16ss);
00274
00275 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
00276 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
00277
00278 pp3A = vec_add(sum3A, pp1A);
00279 pp3B = vec_add(sum3B, pp1B);
00280
00281 psumA = vec_sub(pp3A, pp2A);
00282 psumB = vec_sub(pp3B, pp2B);
00283
00284 sumA = vec_sra(psumA, v5us);
00285 sumB = vec_sra(psumB, v5us);
00286
00287 sum = vec_packsu(sumA, sumB);
00288
00289 ASSERT_ALIGNED(dst);
00290 vdst = vec_ld(0, dst);
00291
00292 OP_U8_ALTIVEC(fsum, sum, vdst);
00293
00294 vec_st(fsum, 0, dst);
00295
00296 src += srcStride;
00297 dst += dstStride;
00298 }
00299 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
00300 }
00301
00302
00303 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
00304 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
00305
00306 register int i;
00307
00308 LOAD_ZERO;
00309 const vec_u8_t perm = vec_lvsl(0, src);
00310 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
00311 const vec_u16_t v5us = vec_splat_u16(5);
00312 const vec_s16_t v5ss = vec_splat_s16(5);
00313 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
00314
00315 uint8_t *srcbis = src - (srcStride * 2);
00316
00317 const vec_u8_t srcM2a = vec_ld(0, srcbis);
00318 const vec_u8_t srcM2b = vec_ld(16, srcbis);
00319 const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
00320
00321 const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
00322 const vec_u8_t srcM1b = vec_ld(16, srcbis);
00323 const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
00324
00325 const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
00326 const vec_u8_t srcP0b = vec_ld(16, srcbis);
00327 const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
00328
00329 const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
00330 const vec_u8_t srcP1b = vec_ld(16, srcbis);
00331 const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
00332
00333 const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
00334 const vec_u8_t srcP2b = vec_ld(16, srcbis);
00335 const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
00336
00337
00338 vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
00339 vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
00340 vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
00341 vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
00342 vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
00343 vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
00344 vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
00345 vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
00346 vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
00347 vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
00348
00349 vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
00350 psumA, psumB, sumA, sumB,
00351 srcP3ssA, srcP3ssB,
00352 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
00353
00354 vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;
00355
00356 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
00357
00358 for (i = 0 ; i < 16 ; i++) {
00359 srcP3a = vec_ld(0, srcbis += srcStride);
00360 srcP3b = vec_ld(16, srcbis);
00361 srcP3 = vec_perm(srcP3a, srcP3b, perm);
00362 srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
00363 srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
00364
00365
00366 sum1A = vec_adds(srcP0ssA, srcP1ssA);
00367 sum1B = vec_adds(srcP0ssB, srcP1ssB);
00368 sum2A = vec_adds(srcM1ssA, srcP2ssA);
00369 sum2B = vec_adds(srcM1ssB, srcP2ssB);
00370 sum3A = vec_adds(srcM2ssA, srcP3ssA);
00371 sum3B = vec_adds(srcM2ssB, srcP3ssB);
00372
00373 srcM2ssA = srcM1ssA;
00374 srcM2ssB = srcM1ssB;
00375 srcM1ssA = srcP0ssA;
00376 srcM1ssB = srcP0ssB;
00377 srcP0ssA = srcP1ssA;
00378 srcP0ssB = srcP1ssB;
00379 srcP1ssA = srcP2ssA;
00380 srcP1ssB = srcP2ssB;
00381 srcP2ssA = srcP3ssA;
00382 srcP2ssB = srcP3ssB;
00383
00384 pp1A = vec_mladd(sum1A, v20ss, v16ss);
00385 pp1B = vec_mladd(sum1B, v20ss, v16ss);
00386
00387 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
00388 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
00389
00390 pp3A = vec_add(sum3A, pp1A);
00391 pp3B = vec_add(sum3B, pp1B);
00392
00393 psumA = vec_sub(pp3A, pp2A);
00394 psumB = vec_sub(pp3B, pp2B);
00395
00396 sumA = vec_sra(psumA, v5us);
00397 sumB = vec_sra(psumB, v5us);
00398
00399 sum = vec_packsu(sumA, sumB);
00400
00401 ASSERT_ALIGNED(dst);
00402 vdst = vec_ld(0, dst);
00403
00404 OP_U8_ALTIVEC(fsum, sum, vdst);
00405
00406 vec_st(fsum, 0, dst);
00407
00408 dst += dstStride;
00409 }
00410 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
00411 }
00412
00413
00414 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
00415 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
00416 register int i;
00417 LOAD_ZERO;
00418 const vec_u8_t permM2 = vec_lvsl(-2, src);
00419 const vec_u8_t permM1 = vec_lvsl(-1, src);
00420 const vec_u8_t permP0 = vec_lvsl(+0, src);
00421 const vec_u8_t permP1 = vec_lvsl(+1, src);
00422 const vec_u8_t permP2 = vec_lvsl(+2, src);
00423 const vec_u8_t permP3 = vec_lvsl(+3, src);
00424 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
00425 const vec_u32_t v10ui = vec_splat_u32(10);
00426 const vec_s16_t v5ss = vec_splat_s16(5);
00427 const vec_s16_t v1ss = vec_splat_s16(1);
00428 const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
00429 const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
00430
00431 register int align = ((((unsigned long)src) - 2) % 16);
00432
00433 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
00434 srcP2A, srcP2B, srcP3A, srcP3B,
00435 srcM1A, srcM1B, srcM2A, srcM2B,
00436 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
00437 pp1A, pp1B, pp2A, pp2B, psumA, psumB;
00438
00439 const vec_u8_t mperm = (const vec_u8_t)
00440 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
00441 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
00442 int16_t *tmpbis = tmp;
00443
00444 vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
00445 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
00446 tmpP2ssA, tmpP2ssB;
00447
00448 vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
00449 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
00450 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
00451 ssumAe, ssumAo, ssumBe, ssumBo;
00452 vec_u8_t fsum, sumv, sum, vdst;
00453 vec_s16_t ssume, ssumo;
00454
00455 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
00456 src -= (2 * srcStride);
00457 for (i = 0 ; i < 21 ; i ++) {
00458 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
00459 vec_u8_t srcR1 = vec_ld(-2, src);
00460 vec_u8_t srcR2 = vec_ld(14, src);
00461
00462 switch (align) {
00463 default: {
00464 srcM2 = vec_perm(srcR1, srcR2, permM2);
00465 srcM1 = vec_perm(srcR1, srcR2, permM1);
00466 srcP0 = vec_perm(srcR1, srcR2, permP0);
00467 srcP1 = vec_perm(srcR1, srcR2, permP1);
00468 srcP2 = vec_perm(srcR1, srcR2, permP2);
00469 srcP3 = vec_perm(srcR1, srcR2, permP3);
00470 } break;
00471 case 11: {
00472 srcM2 = vec_perm(srcR1, srcR2, permM2);
00473 srcM1 = vec_perm(srcR1, srcR2, permM1);
00474 srcP0 = vec_perm(srcR1, srcR2, permP0);
00475 srcP1 = vec_perm(srcR1, srcR2, permP1);
00476 srcP2 = vec_perm(srcR1, srcR2, permP2);
00477 srcP3 = srcR2;
00478 } break;
00479 case 12: {
00480 vec_u8_t srcR3 = vec_ld(30, src);
00481 srcM2 = vec_perm(srcR1, srcR2, permM2);
00482 srcM1 = vec_perm(srcR1, srcR2, permM1);
00483 srcP0 = vec_perm(srcR1, srcR2, permP0);
00484 srcP1 = vec_perm(srcR1, srcR2, permP1);
00485 srcP2 = srcR2;
00486 srcP3 = vec_perm(srcR2, srcR3, permP3);
00487 } break;
00488 case 13: {
00489 vec_u8_t srcR3 = vec_ld(30, src);
00490 srcM2 = vec_perm(srcR1, srcR2, permM2);
00491 srcM1 = vec_perm(srcR1, srcR2, permM1);
00492 srcP0 = vec_perm(srcR1, srcR2, permP0);
00493 srcP1 = srcR2;
00494 srcP2 = vec_perm(srcR2, srcR3, permP2);
00495 srcP3 = vec_perm(srcR2, srcR3, permP3);
00496 } break;
00497 case 14: {
00498 vec_u8_t srcR3 = vec_ld(30, src);
00499 srcM2 = vec_perm(srcR1, srcR2, permM2);
00500 srcM1 = vec_perm(srcR1, srcR2, permM1);
00501 srcP0 = srcR2;
00502 srcP1 = vec_perm(srcR2, srcR3, permP1);
00503 srcP2 = vec_perm(srcR2, srcR3, permP2);
00504 srcP3 = vec_perm(srcR2, srcR3, permP3);
00505 } break;
00506 case 15: {
00507 vec_u8_t srcR3 = vec_ld(30, src);
00508 srcM2 = vec_perm(srcR1, srcR2, permM2);
00509 srcM1 = srcR2;
00510 srcP0 = vec_perm(srcR2, srcR3, permP0);
00511 srcP1 = vec_perm(srcR2, srcR3, permP1);
00512 srcP2 = vec_perm(srcR2, srcR3, permP2);
00513 srcP3 = vec_perm(srcR2, srcR3, permP3);
00514 } break;
00515 }
00516
00517 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
00518 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
00519 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
00520 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
00521
00522 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
00523 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
00524 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
00525 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
00526
00527 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
00528 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
00529 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
00530 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
00531
00532 sum1A = vec_adds(srcP0A, srcP1A);
00533 sum1B = vec_adds(srcP0B, srcP1B);
00534 sum2A = vec_adds(srcM1A, srcP2A);
00535 sum2B = vec_adds(srcM1B, srcP2B);
00536 sum3A = vec_adds(srcM2A, srcP3A);
00537 sum3B = vec_adds(srcM2B, srcP3B);
00538
00539 pp1A = vec_mladd(sum1A, v20ss, sum3A);
00540 pp1B = vec_mladd(sum1B, v20ss, sum3B);
00541
00542 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
00543 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
00544
00545 psumA = vec_sub(pp1A, pp2A);
00546 psumB = vec_sub(pp1B, pp2B);
00547
00548 vec_st(psumA, 0, tmp);
00549 vec_st(psumB, 16, tmp);
00550
00551 src += srcStride;
00552 tmp += tmpStride;
00553 }
00554
00555 tmpM2ssA = vec_ld(0, tmpbis);
00556 tmpM2ssB = vec_ld(16, tmpbis);
00557 tmpbis += tmpStride;
00558 tmpM1ssA = vec_ld(0, tmpbis);
00559 tmpM1ssB = vec_ld(16, tmpbis);
00560 tmpbis += tmpStride;
00561 tmpP0ssA = vec_ld(0, tmpbis);
00562 tmpP0ssB = vec_ld(16, tmpbis);
00563 tmpbis += tmpStride;
00564 tmpP1ssA = vec_ld(0, tmpbis);
00565 tmpP1ssB = vec_ld(16, tmpbis);
00566 tmpbis += tmpStride;
00567 tmpP2ssA = vec_ld(0, tmpbis);
00568 tmpP2ssB = vec_ld(16, tmpbis);
00569 tmpbis += tmpStride;
00570
00571 for (i = 0 ; i < 16 ; i++) {
00572 const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
00573 const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
00574
00575 const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
00576 const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
00577 const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
00578 const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
00579 const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
00580 const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
00581
00582 tmpbis += tmpStride;
00583
00584 tmpM2ssA = tmpM1ssA;
00585 tmpM2ssB = tmpM1ssB;
00586 tmpM1ssA = tmpP0ssA;
00587 tmpM1ssB = tmpP0ssB;
00588 tmpP0ssA = tmpP1ssA;
00589 tmpP0ssB = tmpP1ssB;
00590 tmpP1ssA = tmpP2ssA;
00591 tmpP1ssB = tmpP2ssB;
00592 tmpP2ssA = tmpP3ssA;
00593 tmpP2ssB = tmpP3ssB;
00594
00595 pp1Ae = vec_mule(sum1A, v20ss);
00596 pp1Ao = vec_mulo(sum1A, v20ss);
00597 pp1Be = vec_mule(sum1B, v20ss);
00598 pp1Bo = vec_mulo(sum1B, v20ss);
00599
00600 pp2Ae = vec_mule(sum2A, v5ss);
00601 pp2Ao = vec_mulo(sum2A, v5ss);
00602 pp2Be = vec_mule(sum2B, v5ss);
00603 pp2Bo = vec_mulo(sum2B, v5ss);
00604
00605 pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
00606 pp3Ao = vec_mulo(sum3A, v1ss);
00607 pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
00608 pp3Bo = vec_mulo(sum3B, v1ss);
00609
00610 pp1cAe = vec_add(pp1Ae, v512si);
00611 pp1cAo = vec_add(pp1Ao, v512si);
00612 pp1cBe = vec_add(pp1Be, v512si);
00613 pp1cBo = vec_add(pp1Bo, v512si);
00614
00615 pp32Ae = vec_sub(pp3Ae, pp2Ae);
00616 pp32Ao = vec_sub(pp3Ao, pp2Ao);
00617 pp32Be = vec_sub(pp3Be, pp2Be);
00618 pp32Bo = vec_sub(pp3Bo, pp2Bo);
00619
00620 sumAe = vec_add(pp1cAe, pp32Ae);
00621 sumAo = vec_add(pp1cAo, pp32Ao);
00622 sumBe = vec_add(pp1cBe, pp32Be);
00623 sumBo = vec_add(pp1cBo, pp32Bo);
00624
00625 ssumAe = vec_sra(sumAe, v10ui);
00626 ssumAo = vec_sra(sumAo, v10ui);
00627 ssumBe = vec_sra(sumBe, v10ui);
00628 ssumBo = vec_sra(sumBo, v10ui);
00629
00630 ssume = vec_packs(ssumAe, ssumBe);
00631 ssumo = vec_packs(ssumAo, ssumBo);
00632
00633 sumv = vec_packsu(ssume, ssumo);
00634 sum = vec_perm(sumv, sumv, mperm);
00635
00636 ASSERT_ALIGNED(dst);
00637 vdst = vec_ld(0, dst);
00638
00639 OP_U8_ALTIVEC(fsum, sum, vdst);
00640
00641 vec_st(fsum, 0, dst);
00642
00643 dst += dstStride;
00644 }
00645 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
00646 }