00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include <cstdio>
00027 #include <cstdlib>
00028 #include <algorithm>
00029 #include <inttypes.h>
00030 #include <limits.h>
00031 #include "mythconfig.h"
00032 #include "mythtvexp.h"
00033
00034 #if HAVE_MMX
00035 extern "C" {
00036 #include "ffmpeg-mmx.h"
00037 }
00038 #define CPU_MMXEXT 0
00039 #define CPU_MMX 1
00040 #endif
00041
00042 #if HAVE_ALTIVEC
00043 extern "C" {
00044 #include "libavutil/cpu.h"
00045 }
00046 int has_altivec(void);
00047 #if HAVE_ALTIVEC_H
00048 #include <altivec.h>
00049 #else
00050 #include <Accelerate/Accelerate.h>
00051 #endif
00052 #endif
00053 #include "yuv2rgb.h"
00054
00055 #if HAVE_ALTIVEC
00056 int has_altivec(void)
00057 {
00058 int cpu_flags = av_get_cpu_flags();
00059 if (cpu_flags & AV_CPU_FLAG_ALTIVEC)
00060 return(1);
00061
00062 return(0);
00063 }
00064 #endif
00065
00071 static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py,
00072 unsigned char *pu, unsigned char *pv,
00073 int h_size, int v_size, int rgb_stride,
00074 int y_stride, int uv_stride, int alphaones)
00075 MUNUSED;
00076
00077
00078
/*
 * Store MMX register `src` to memory operand `dest`.
 *
 * A variable named `cpu` must be in scope where the macro is expanded:
 * CPU_MMXEXT selects movntq_r2m, any other value selects movq_r2m.
 */
#define movntq(src,dest)            \
do {                                \
    if (cpu != CPU_MMXEXT)          \
        movq_r2m (src, dest);       \
    else                            \
        movntq_r2m (src, dest);     \
} while (0)
00086
00087 #if HAVE_MMX
00088 static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
00089 {
00090 static mmx_t mmx_80w = {0x0080008000800080LL};
00091 static mmx_t mmx_U_green = {0xf37df37df37df37dLL};
00092 static mmx_t mmx_U_blue = {0x4093409340934093LL};
00093 static mmx_t mmx_V_red = {0x3312331233123312LL};
00094 static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL};
00095 static mmx_t mmx_10w = {0x1010101010101010LL};
00096 static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL};
00097 static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL};
00098
00099 movd_m2r (*pu, mm0);
00100 movd_m2r (*pv, mm1);
00101 movq_m2r (*py, mm6);
00102 pxor_r2r (mm4, mm4);
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113 punpcklbw_r2r (mm4, mm0);
00114 punpcklbw_r2r (mm4, mm1);
00115 psubsw_m2r (mmx_80w, mm0);
00116 psubsw_m2r (mmx_80w, mm1);
00117 psllw_i2r (3, mm0);
00118 psllw_i2r (3, mm1);
00119 movq_r2r (mm0, mm2);
00120 movq_r2r (mm1, mm3);
00121 pmulhw_m2r (mmx_U_green, mm2);
00122 pmulhw_m2r (mmx_V_green, mm3);
00123 pmulhw_m2r (mmx_U_blue, mm0);
00124 pmulhw_m2r (mmx_V_red, mm1);
00125 paddsw_r2r (mm3, mm2);
00126
00127 psubusb_m2r (mmx_10w, mm6);
00128 movq_r2r (mm6, mm7);
00129 pand_m2r (mmx_00ffw, mm6);
00130 psrlw_i2r (8, mm7);
00131 psllw_i2r (3, mm6);
00132 psllw_i2r (3, mm7);
00133 pmulhw_m2r (mmx_Y_coeff, mm6);
00134 pmulhw_m2r (mmx_Y_coeff, mm7);
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144 movq_r2r (mm0, mm3);
00145 movq_r2r (mm1, mm4);
00146 movq_r2r (mm2, mm5);
00147 paddsw_r2r (mm6, mm0);
00148 paddsw_r2r (mm7, mm3);
00149 paddsw_r2r (mm6, mm1);
00150 paddsw_r2r (mm7, mm4);
00151 paddsw_r2r (mm6, mm2);
00152 paddsw_r2r (mm7, mm5);
00153 packuswb_r2r (mm0, mm0);
00154 packuswb_r2r (mm1, mm1);
00155 packuswb_r2r (mm2, mm2);
00156 packuswb_r2r (mm3, mm3);
00157 packuswb_r2r (mm4, mm4);
00158 packuswb_r2r (mm5, mm5);
00159 punpcklbw_r2r (mm3, mm0);
00160 punpcklbw_r2r (mm4, mm1);
00161 punpcklbw_r2r (mm5, mm2);
00162 }
00163
00164 static inline void mmx_unpack_16rgb (uint8_t * image, int cpu)
00165 {
00166 static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8LL};
00167 static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};
00168 static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8LL};
00169
00170
00171
00172
00173
00174
00175
00176
00177 pand_m2r (mmx_bluemask, mm0);
00178 pand_m2r (mmx_greenmask, mm2);
00179 pand_m2r (mmx_redmask, mm1);
00180 psrlq_i2r (3, mm0);
00181 pxor_r2r (mm4, mm4);
00182 movq_r2r (mm0, mm5);
00183 movq_r2r (mm2, mm7);
00184
00185 punpcklbw_r2r (mm4, mm2);
00186 punpcklbw_r2r (mm1, mm0);
00187 psllq_i2r (3, mm2);
00188 por_r2r (mm2, mm0);
00189 movntq (mm0, *image);
00190
00191 punpckhbw_r2r (mm4, mm7);
00192 punpckhbw_r2r (mm1, mm5);
00193 psllq_i2r (3, mm7);
00194 por_r2r (mm7, mm5);
00195 movntq (mm5, *(image+8));
00196 }
00197
00198 static inline void mmx_unpack_32rgb (uint8_t * image, int cpu, int alphaones)
00199 {
00200
00201
00202
00203
00204
00205
00206
00207 if (alphaones)
00208 {
00209 static mmx_t mmx_1s = {0xffffffffffffffffLL};
00210 movq_m2r (mmx_1s, mm3);
00211 }
00212 else
00213 pxor_r2r (mm3, mm3);
00214
00215 movq_r2r (mm0, mm6);
00216 movq_r2r (mm1, mm7);
00217 movq_r2r (mm0, mm4);
00218 movq_r2r (mm1, mm5);
00219 punpcklbw_r2r (mm2, mm6);
00220 punpcklbw_r2r (mm3, mm7);
00221 punpcklwd_r2r (mm7, mm6);
00222 movntq (mm6, *image);
00223 movq_r2r (mm0, mm6);
00224 punpcklbw_r2r (mm2, mm6);
00225 punpckhwd_r2r (mm7, mm6);
00226 movntq (mm6, *(image+8));
00227 punpckhbw_r2r (mm2, mm4);
00228 punpckhbw_r2r (mm3, mm5);
00229 punpcklwd_r2r (mm5, mm4);
00230 movntq (mm4, *(image+16));
00231 movq_r2r (mm0, mm4);
00232 punpckhbw_r2r (mm2, mm4);
00233 punpckhwd_r2r (mm5, mm4);
00234 movntq (mm4, *(image+24));
00235 }
00236
00237 static inline void yuv420_rgb16 (uint8_t * image,
00238 uint8_t * py, uint8_t * pu, uint8_t * pv,
00239 int width, int height,
00240 int rgb_stride, int y_stride, int uv_stride,
00241 int cpu, int alphaones)
00242 {
00243 (void)alphaones;
00244 int i;
00245
00246 rgb_stride -= 2 * width;
00247 y_stride -= width;
00248 uv_stride -= width >> 1;
00249 width >>= 3;
00250
00251 do {
00252 i = width;
00253 do {
00254 mmx_yuv2rgb (py, pu, pv);
00255 mmx_unpack_16rgb (image, cpu);
00256 py += 8;
00257 pu += 4;
00258 pv += 4;
00259 image += 16;
00260 } while (--i);
00261
00262 py += y_stride;
00263 image += rgb_stride;
00264 if (height & 1) {
00265 pu += uv_stride;
00266 pv += uv_stride;
00267 } else {
00268 pu -= 4 * width;
00269 pv -= 4 * width;
00270 }
00271 } while (--height);
00272
00273 emms();
00274 }
00275
00276 static inline void yuv420_argb32 (uint8_t * image, uint8_t * py,
00277 uint8_t * pu, uint8_t * pv,
00278 int width, int height,
00279 int rgb_stride, int y_stride, int uv_stride,
00280 int cpu, int alphaones)
00281 {
00282 int i;
00283
00284 rgb_stride -= 4 * width;
00285 y_stride -= width;
00286 uv_stride -= width >> 1;
00287 width >>= 3;
00288
00289 do {
00290 i = width;
00291 do {
00292 mmx_yuv2rgb (py, pu, pv);
00293 mmx_unpack_32rgb (image, cpu, alphaones);
00294 py += 8;
00295 pu += 4;
00296 pv += 4;
00297 image += 32;
00298 } while (--i);
00299
00300 py += y_stride;
00301 image += rgb_stride;
00302 if (height & 1) {
00303 pu += uv_stride;
00304 pv += uv_stride;
00305 } else {
00306 pu -= 4 * width;
00307 pv -= 4 * width;
00308 }
00309 } while (--height);
00310
00311 emms();
00312 }
00313
00314 static void mmxext_rgb16 (uint8_t * image,
00315 uint8_t * py, uint8_t * pu, uint8_t * pv,
00316 int width, int height,
00317 int rgb_stride, int y_stride, int uv_stride,
00318 int alphaones)
00319 {
00320 yuv420_rgb16 (image, py, pu, pv, width, height,
00321 rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones);
00322 }
00323
00324 static void mmxext_argb32 (uint8_t * image,
00325 uint8_t * py, uint8_t * pu, uint8_t * pv,
00326 int width, int height,
00327 int rgb_stride, int y_stride, int uv_stride,
00328 int alphaones)
00329 {
00330 yuv420_argb32 (image, py, pu, pv, width, height,
00331 rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones);
00332 }
00333
00334 static void mmx_rgb16 (uint8_t * image,
00335 uint8_t * py, uint8_t * pu, uint8_t * pv,
00336 int width, int height,
00337 int rgb_stride, int y_stride, int uv_stride,
00338 int alphaones)
00339 {
00340 yuv420_rgb16 (image, py, pu, pv, width, height,
00341 rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones);
00342 }
00343
00344 static void mmx_argb32 (uint8_t * image,
00345 uint8_t * py, uint8_t * pu, uint8_t * pv,
00346 int width, int height,
00347 int rgb_stride, int y_stride, int uv_stride,
00348 int alphaones)
00349 {
00350 yuv420_argb32 (image, py, pu, pv, width, height,
00351 rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones);
00352 }
00353 #endif
00354
00364 yuv2rgb_fun yuv2rgb_init_mmxext (int bpp, int mode)
00365 {
00366 #if HAVE_MMX
00367 if ((bpp == 16) && (mode == MODE_RGB))
00368 return mmxext_rgb16;
00369 else if ((bpp == 32) && (mode == MODE_RGB))
00370 return mmxext_argb32;
00371 #endif
00372
00373 (void)bpp;
00374 (void)mode;
00375
00376 return NULL;
00377 }
00378
00388 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode)
00389 {
00390 #if HAVE_MMX
00391 if ((bpp == 16) && (mode == MODE_RGB))
00392 return mmx_rgb16;
00393 else if ((bpp == 32) && (mode == MODE_RGB))
00394 return mmx_argb32;
00395 #endif
00396 if ((bpp == 32) && (mode == MODE_RGB))
00397 return yuv420_argb32_non_mmx;
00398
00399 return NULL;
00400 }
00401
/* Fixed-point precision (fractional bits) used by the C converter. */
#define SCALE_BITS 10

/* Conversion coefficients: 16.16 constants reduced to SCALE_BITS precision. */
#define C_Y  (76309 >> (16 - SCALE_BITS))
#define C_RV (117504 >> (16 - SCALE_BITS))
#define C_BU (138453 >> (16 - SCALE_BITS))
#define C_GU (13954 >> (16 - SCALE_BITS))
#define C_GV (34903 >> (16 - SCALE_BITS))

#if defined(__FreeBSD__)
/* NOTE(review): presumably forces UCHAR_MAX to type int on FreeBSD - confirm. */
#undef UCHAR_MAX
#define UCHAR_MAX (int)__UCHAR_MAX
#endif

/* Clamps a computed colour component to the 0..UCHAR_MAX byte range. */
static inline unsigned char yuv2rgb_clip_uint8(int value)
{
    if (value < 0)
        return 0;
    if (value > UCHAR_MAX)
        return (unsigned char)UCHAR_MAX;
    return (unsigned char)value;
}

/*
 * Computes one output pixel from luma sample y1.
 * Requires `y`, `r_add`, `g_add` and `b_add` to be in scope.
 */
#define RGBOUT(r, g, b, y1)\
do {\
    y = ((y1) - 16) * C_Y;\
    r = yuv2rgb_clip_uint8((y + r_add) >> SCALE_BITS);\
    g = yuv2rgb_clip_uint8((y + g_add) >> SCALE_BITS);\
    b = yuv2rgb_clip_uint8((y + b_add) >> SCALE_BITS);\
} while (0)

/**
 *  \brief Plain C conversion of a planar YUV 4:2:0 picture to 4-byte pixels.
 *
 *  Two rows are converted per iteration so that each chroma sample is
 *  fetched once for its 2x2 luma block.
 *
 *  Changes relative to the previous implementation:
 *   - the three stride arguments are honoured (they used to be ignored, so
 *     padded planes were converted incorrectly, unlike the MMX converters
 *     that share the same function-pointer type);
 *   - an odd \p v_size no longer reads and writes one row past the
 *     buffers: the final single row is converted on its own;
 *   - row start addresses are recomputed for every row pair, so an odd
 *     \p h_size no longer shifts all following rows.
 *
 *  A stride that is smaller than the corresponding row (including 0 and
 *  negative values) is treated as "tightly packed", which reproduces the
 *  old behaviour for callers that passed placeholder strides.
 *  When \p h_size is odd the last column is not converted.
 *
 *  \param image      destination, 4 bytes per pixel
 *  \param py         luma plane
 *  \param pu         U plane
 *  \param pv         V plane
 *  \param h_size     picture width in pixels
 *  \param v_size     picture height in rows
 *  \param rgb_stride bytes between destination rows
 *  \param y_stride   bytes between luma rows
 *  \param uv_stride  bytes between chroma rows
 *  \param alphaones  non-zero: alpha bytes are 0xff; zero: alpha bytes are 0
 */
static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py,
                           unsigned char *pu, unsigned char *pv,
                           int h_size, int v_size, int rgb_stride,
                           int y_stride, int uv_stride, int alphaones)
{
    unsigned char *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr, *d1, *d2;
    int w, y, cb, cr, r_add, g_add, b_add;
    int row;
    const int width2 = h_size / 2;
    const unsigned char alpha = alphaones ? 0xff : 0;

    /* Byte positions of the components inside one destination pixel. */
#if HAVE_BIGENDIAN
#define R_OI  1
#define G_OI  2
#define B_OI  3
#define A_OI  0
#else
#define R_OI  2
#define G_OI  1
#define B_OI  0
#define A_OI  3
#endif

    /* Unusable strides fall back to the legacy tightly-packed layout. */
    if (rgb_stride < h_size * 4)
        rgb_stride = h_size * 4;
    if (y_stride < h_size)
        y_stride = h_size;
    if (uv_stride < width2)
        uv_stride = width2;

    for (row = 0; row < v_size; row += 2) {
        d1     = image + row * rgb_stride;
        y1_ptr = py + row * y_stride;
        cb_ptr = pu + (row / 2) * uv_stride;
        cr_ptr = pv + (row / 2) * uv_stride;

        if (row + 1 < v_size) {
            d2     = d1 + rgb_stride;
            y2_ptr = y1_ptr + y_stride;
        } else {
            /* Odd height: alias the missing second row onto the first so
             * the pair-wise loop below stays inside the buffers. */
            d2     = d1;
            y2_ptr = y1_ptr;
        }

        for (w = width2; w > 0; w--) {
            cb = cb_ptr[0] - 128;
            cr = cr_ptr[0] - 128;
            /* Chroma contributions, including the rounding term. */
            r_add = C_RV * cr + (1 << (SCALE_BITS - 1));
            g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1));
            b_add = C_BU * cb + (1 << (SCALE_BITS - 1));

            RGBOUT(d1[R_OI], d1[G_OI], d1[B_OI], y1_ptr[0]);
            RGBOUT(d1[R_OI+4], d1[G_OI+4], d1[B_OI+4], y1_ptr[1]);
            RGBOUT(d2[R_OI], d2[G_OI], d2[B_OI], y2_ptr[0]);
            RGBOUT(d2[R_OI+4], d2[G_OI+4], d2[B_OI+4], y2_ptr[1]);

            d1[A_OI] = d1[A_OI+4] = d2[A_OI] = d2[A_OI+4] = alpha;

            d1 += 8;
            d2 += 8;
            y1_ptr += 2;
            y2_ptr += 2;
            cb_ptr++;
            cr_ptr++;
        }
    }
}
00490
/* Fixed-point helpers: FIX(x) is x scaled by 2^SCALEBITS and rounded. */
#define SCALEBITS 8
#define ONE_HALF  (1 << (SCALEBITS - 1))
#define FIX(x)    ((int) ((x) * (1L<<SCALEBITS) + 0.5))

/* Byte offsets of the components inside one 4-byte source pixel. */
#if HAVE_BIGENDIAN
#define R_II  3
#define G_II  2
#define B_II  1
#define A_II  0
#else
#define R_II  0
#define G_II  1
#define B_II  2
#define A_II  3
#endif

/* Writes luma and alpha for one source pixel and adds its components
 * to the running chroma sums. */
static inline void rgb32_emit_pixel(const unsigned char *px,
                                    unsigned char *lum_out,
                                    unsigned char *alpha_out,
                                    int *r_sum, int *g_sum, int *b_sum)
{
    const int r = px[R_II];
    const int g = px[G_II];
    const int b = px[B_II];

    *r_sum += r;
    *g_sum += g;
    *b_sum += b;

    lum_out[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
                  FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
    alpha_out[0] = px[A_II];
}

/* Fills one padding position: luma 16, alpha 0. */
static inline void rgb32_emit_padding(unsigned char *lum_out,
                                      unsigned char *alpha_out)
{
    lum_out[0]   = 16;
    alpha_out[0] = 0;
}

/* Stores one chroma pair from component sums taken over
 * (1 << log2_count) source pixels. */
static inline void rgb32_emit_chroma(unsigned char *cb, unsigned char *cr,
                                     int r_sum, int g_sum, int b_sum,
                                     int log2_count)
{
    const int shift    = SCALEBITS + log2_count;
    const int rounding = (ONE_HALF << log2_count) - 1;

    cr[0] = ((- FIX(0.16874) * r_sum - FIX(0.33126) * g_sum +
              FIX(0.50000) * b_sum + rounding) >> shift) + 128;
    cb[0] = ((FIX(0.50000) * r_sum - FIX(0.41869) * g_sum -
              FIX(0.08131) * b_sum + rounding) >> shift) + 128;
}

/**
 *  \brief Converts 4-byte-per-pixel source data into luma, alpha and
 *         2x2-subsampled chroma planes.
 *
 *  Output rows of \p lum and \p alpha are (width rounded up to even) bytes
 *  apart.  When \p width is odd, one padding position (luma 16, alpha 0) is
 *  written to the right of every row; when \p height is odd, a whole
 *  padding row is written below the last one, so the output planes must
 *  provide room for even dimensions.  Chroma is computed from the sum of
 *  the source pixels that actually exist in each 2x2 cell.
 *
 *  NOTE(review): the value stored through \p cr uses the coefficient set
 *  conventionally associated with Cb and vice versa - confirm against the
 *  callers and the source byte order before changing anything.
 *
 *  \param lum      output luma plane
 *  \param cb       output chroma plane (see note above)
 *  \param cr       output chroma plane (see note above)
 *  \param alpha    output alpha plane, same layout as \p lum
 *  \param src      source pixels, 4 bytes each
 *  \param width    number of pixels to convert per row
 *  \param height   number of rows to convert
 *  \param srcwidth source row length in pixels (row pitch = srcwidth * 4)
 */
void rgb32_to_yuv420p(unsigned char *lum, unsigned char *cb, unsigned char *cr,
                      unsigned char *alpha, unsigned char *src,
                      int width, int height, int srcwidth)
{
    const int wrap  = (width + 1) & ~1;   /* output row pitch          */
    const int wrap4 = srcwidth * 4;       /* source row pitch in bytes */
    const unsigned char *p = src;
    int x, y;

    /* Full pairs of rows. */
    for (y = 0; y + 1 < height; y += 2) {
        for (x = 0; x + 1 < width; x += 2) {
            int rs = 0, gs = 0, bs = 0;

            rgb32_emit_pixel(p,             lum,            alpha,
                             &rs, &gs, &bs);
            rgb32_emit_pixel(p + 4,         lum + 1,        alpha + 1,
                             &rs, &gs, &bs);
            rgb32_emit_pixel(p + wrap4,     lum + wrap,     alpha + wrap,
                             &rs, &gs, &bs);
            rgb32_emit_pixel(p + wrap4 + 4, lum + wrap + 1, alpha + wrap + 1,
                             &rs, &gs, &bs);
            rgb32_emit_chroma(cb, cr, rs, gs, bs, 2);

            cb++;
            cr++;
            p     += 2 * 4;
            lum   += 2;
            alpha += 2;
        }
        if (width & 1) {
            /* Last column: only the left half of the cell exists. */
            int rs = 0, gs = 0, bs = 0;

            rgb32_emit_pixel(p, lum, alpha, &rs, &gs, &bs);
            rgb32_emit_padding(lum + 1, alpha + 1);
            rgb32_emit_pixel(p + wrap4, lum + wrap, alpha + wrap,
                             &rs, &gs, &bs);
            rgb32_emit_padding(lum + wrap + 1, alpha + wrap + 1);
            rgb32_emit_chroma(cb, cr, rs, gs, bs, 1);

            cb++;
            cr++;
            p     += 4;
            lum   += 2;
            alpha += 2;
        }
        /* Step over the second row of the pair that was just written. */
        p     += wrap4 * 2 - width * 4;
        lum   += wrap;
        alpha += wrap;
    }

    /* Trailing single row: the row below it is padding only. */
    if (height & 1) {
        for (x = 0; x + 1 < width; x += 2) {
            int rs = 0, gs = 0, bs = 0;

            rgb32_emit_pixel(p,     lum,     alpha,     &rs, &gs, &bs);
            rgb32_emit_pixel(p + 4, lum + 1, alpha + 1, &rs, &gs, &bs);
            rgb32_emit_padding(lum + wrap,     alpha + wrap);
            rgb32_emit_padding(lum + wrap + 1, alpha + wrap + 1);
            rgb32_emit_chroma(cb, cr, rs, gs, bs, 1);

            cb++;
            cr++;
            p     += 2 * 4;
            lum   += 2;
            alpha += 2;
        }
        if (width & 1) {
            /* Bottom-right corner: a single real pixel. */
            int rs = 0, gs = 0, bs = 0;

            rgb32_emit_pixel(p, lum, alpha, &rs, &gs, &bs);
            rgb32_emit_padding(lum + 1,        alpha + 1);
            rgb32_emit_padding(lum + wrap,     alpha + wrap);
            rgb32_emit_padding(lum + wrap + 1, alpha + wrap + 1);
            rgb32_emit_chroma(cb, cr, rs, gs, bs, 0);
        }
    }
}
00712
00713
00714
00715
00716
00717
00718
00719
00720
00721
00722
00723
00724
00725
00726
/**
 *  \brief Plain C conversion of planar 4:2:0 data to packed U Y V Y data.
 *
 *  For every pair of rows, each chroma sample is written to both output
 *  rows; output bytes are ordered U, Y(even), V, Y(odd).
 *  Only (v_size / 2) row pairs and (h_size / 2) pixel pairs are processed,
 *  so a trailing odd row or column is not converted.
 *
 *  \param image      destination, 2 bytes per pixel
 *  \param vuy_stride bytes between destination rows
 *  \param py         luma plane
 *  \param pu         U plane
 *  \param pv         V plane
 *  \param y_stride   bytes between luma rows
 *  \param u_stride   bytes between U rows
 *  \param v_stride   bytes between V rows
 *  \param h_size     picture width in pixels
 *  \param v_size     picture height in rows
 */
static void non_vec_i420_2vuy(
    uint8_t *image, int vuy_stride,
    const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    int h_size, int v_size)
{
    const int row_pairs   = v_size >> 1;
    const int pixel_pairs = h_size >> 1;
    int pair, col;

    for (pair = 0; pair < row_pairs; pair++)
    {
        uint8_t       *dst_top   = image + 2*pair * vuy_stride;
        uint8_t       *dst_bot   = dst_top + vuy_stride;
        const uint8_t *y_top     = py + 2*pair * y_stride;
        const uint8_t *y_bot     = y_top + y_stride;
        const uint8_t *u_row     = pu + pair * u_stride;
        const uint8_t *v_row     = pv + pair * v_stride;

        for (col = 0; col < pixel_pairs; col++)
        {
            uint8_t *out_top = dst_top + 4*col;
            uint8_t *out_bot = dst_bot + 4*col;

            out_top[0] = u_row[col];
            out_bot[0] = u_row[col];
            out_top[1] = y_top[2*col];
            out_bot[1] = y_bot[2*col];
            out_top[2] = v_row[col];
            out_bot[2] = v_row[col];
            out_top[3] = y_top[2*col + 1];
            out_bot[3] = y_bot[2*col + 1];
        }
    }
}
00774
00775 #if HAVE_MMX
00776
00788 static void mmx_i420_2vuy(
00789 uint8_t *image, int vuy_stride,
00790 const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
00791 int y_stride, int u_stride, int v_stride,
00792 int h_size, int v_size)
00793 {
00794 uint8_t *pi1, *pi2;
00795 const uint8_t *py1 = py;
00796 const uint8_t *py2 = py;
00797 const uint8_t *pu1 = pu;
00798 const uint8_t *pv1 = pv;
00799
00800 int x,y;
00801
00802 if ((h_size % 16) || (v_size % 2))
00803 {
00804 non_vec_i420_2vuy(image, vuy_stride,
00805 py, pu, pv, y_stride, u_stride, v_stride,
00806 h_size, v_size);
00807 return;
00808 }
00809
00810 emms();
00811
00812 for (y = 0; y < (v_size>>1); y++)
00813 {
00814 pi1 = image + 2*y * vuy_stride;
00815 pi2 = image + 2*y * vuy_stride + vuy_stride;
00816 py1 = py + 2*y * y_stride;
00817 py2 = py + 2*y * y_stride + y_stride;
00818 pu1 = pu + y * u_stride;
00819 pv1 = pv + y * v_stride;
00820
00821 for (x = 0; x < h_size / 16; x++)
00822 {
00823 movq_m2r (*py1, mm0);
00824 movq_m2r (*py2, mm1);
00825 movq_m2r (*pu1, mm2);
00826 movq_m2r (*pv1, mm3);
00827
00828 movq_r2r (mm2, mm4);
00829
00830 punpcklbw_r2r (mm3, mm2);
00831 punpckhbw_r2r (mm3, mm4);
00832
00833 movq_r2r (mm2, mm5);
00834 movq_r2r (mm2, mm6);
00835 punpcklbw_r2r (mm0, mm5);
00836 punpckhbw_r2r (mm0, mm6);
00837
00838 movntq_r2m (mm5, *(pi1));
00839 movntq_r2m (mm6, *(pi1+8));
00840
00841 movq_r2r (mm2, mm5);
00842 movq_r2r (mm2, mm6);
00843 punpcklbw_r2r (mm1, mm5);
00844 punpckhbw_r2r (mm1, mm6);
00845
00846 movntq_r2m (mm5, *(pi2));
00847 movntq_r2m (mm6, *(pi2+8));
00848
00849
00850 movq_m2r (*(py1+8), mm0);
00851 movq_m2r (*(py2+8), mm1);
00852
00853 movq_r2r (mm4, mm5);
00854 movq_r2r (mm4, mm6);
00855 punpcklbw_r2r (mm0, mm5);
00856 punpckhbw_r2r (mm0, mm6);
00857
00858 movntq_r2m (mm5, *(pi1+16));
00859 movntq_r2m (mm6, *(pi1+24));
00860
00861 movq_r2r (mm4, mm5);
00862 movq_r2r (mm4, mm6);
00863 punpcklbw_r2r (mm1, mm5);
00864 punpckhbw_r2r (mm1, mm6);
00865
00866 movntq_r2m (mm5, *(pi2+16));
00867 movntq_r2m (mm6, *(pi2+24));
00868
00869 pi1 += 32;
00870 pi2 += 32;
00871 py1 += 16;
00872 py2 += 16;
00873 pu1 += 8;
00874 pv1 += 8;
00875 }
00876 }
00877
00878 emms();
00879 }
00880
00881 #endif // HAVE_MMX
00882
00883 #if HAVE_ALTIVEC
00884
00885
00886
00887 #define VEC_NEXT_LINES() \
00888 pi1 = pi2; \
00889 pi2 += h_size * 2; \
00890 py1 = py2; \
00891 py2 += h_size;
00892
00893 #define VEC_LOAD_UV() \
00894 u_vec = vec_ld(0, pu); pu += 16; \
00895 v_vec = vec_ld(0, pv); pv += 16;
00896
00897 #define VEC_MERGE(a) \
00898 uv_vec = a(u_vec, v_vec); \
00899 y_vec = vec_ld(0, py1); py1 += 16; \
00900 vec_st(vec_mergeh(uv_vec, y_vec), 0, pi1); pi1 += 16; \
00901 vec_st(vec_mergel(uv_vec, y_vec), 0, pi1); pi1 += 16; \
00902 y_vec = vec_ld(0, py2); py2 += 16; \
00903 vec_st(vec_mergeh(uv_vec, y_vec), 0, pi2); pi2 += 16; \
00904 vec_st(vec_mergel(uv_vec, y_vec), 0, pi2); pi2 += 16;
00905
00918 static void altivec_i420_2vuy(
00919 uint8_t *image, int vuy_stride,
00920 const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
00921 int y_stride, int u_stride, int v_stride,
00922 int h_size, int v_size)
00923 {
00924 uint8_t *pi1, *pi2 = image;
00925 const uint8_t *py1;
00926 const uint8_t *py2 = py;
00927
00928 int x, y;
00929
00930 vector unsigned char u_vec;
00931 vector unsigned char v_vec;
00932 vector unsigned char uv_vec;
00933 vector unsigned char y_vec;
00934
00935 int vuy_extra = vuy_stride - (h_size<<1);
00936 int y_extra = y_stride - (h_size);
00937 int u_extra = u_stride - (h_size>>1);
00938 int v_extra = v_stride - (h_size>>1);
00939
00940 if (vuy_extra || y_extra || u_extra || v_extra)
00941 {
00942
00943 non_vec_i420_2vuy(image, vuy_stride,
00944 py, pu, pv,
00945 y_stride, u_stride, v_stride,
00946 h_size, v_size);
00947 return;
00948 }
00949
00950 if (!((h_size % 32) || (v_size % 2)))
00951 {
00952
00953 for (y = v_size / 2; y--; )
00954 {
00955 VEC_NEXT_LINES();
00956 for (x = h_size / 32; x--; )
00957 {
00958 VEC_LOAD_UV();
00959 VEC_MERGE(vec_mergeh);
00960 VEC_MERGE(vec_mergel);
00961 }
00962 }
00963
00964 }
00965 else if (!((h_size % 16) || (v_size % 4)))
00966 {
00967
00968 for (y = v_size / 4; y--; )
00969 {
00970
00971 VEC_NEXT_LINES();
00972 for (x = h_size / 32; x--; )
00973 {
00974 VEC_LOAD_UV();
00975 VEC_MERGE(vec_mergeh);
00976 VEC_MERGE(vec_mergel);
00977 }
00978
00979
00980 VEC_LOAD_UV();
00981 VEC_MERGE(vec_mergeh);
00982
00983
00984 VEC_NEXT_LINES();
00985 VEC_MERGE(vec_mergel);
00986
00987
00988 for (x = h_size / 32; x--; )
00989 {
00990 VEC_LOAD_UV();
00991 VEC_MERGE(vec_mergeh);
00992 VEC_MERGE(vec_mergel);
00993 }
00994 }
00995 }
00996 else
00997 {
00998
00999 non_vec_i420_2vuy(image, vuy_stride,
01000 py, pu, pv,
01001 y_stride, u_stride, v_stride,
01002 h_size, v_size);
01003 }
01004 }
01005
01006 #endif // HAVE_ALTIVEC
01007
01008
01022 conv_i420_2vuy_fun get_i420_2vuy_conv(void)
01023 {
01024 #if HAVE_ALTIVEC
01025 if (has_altivec())
01026 return altivec_i420_2vuy;
01027 #endif
01028 #if HAVE_MMX
01029 return mmx_i420_2vuy;
01030 #else
01031 return non_vec_i420_2vuy;
01032 #endif
01033 }
01034
/**
 *  \brief Plain C conversion of packed U Y V Y data to planar 4:2:0 data.
 *
 *  Luma is copied through; each output chroma sample is the truncated
 *  average of the two vertically adjacent input chroma samples.
 *  Only (v_size / 2) row pairs and (h_size / 2) pixel pairs are processed,
 *  so a trailing odd row or column is not converted.
 *
 *  \param py         output luma plane
 *  \param pu         output U plane
 *  \param pv         output V plane
 *  \param y_stride   bytes between luma rows
 *  \param u_stride   bytes between U rows
 *  \param v_stride   bytes between V rows
 *  \param image      packed source, 2 bytes per pixel
 *  \param vuy_stride bytes between source rows
 *  \param h_size     picture width in pixels
 *  \param v_size     picture height in rows
 */
static void non_vec_2vuy_i420(
    uint8_t *py, uint8_t *pu, uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    const uint8_t *image, int vuy_stride,
    int h_size, int v_size)
{
    const int row_pairs   = v_size >> 1;
    const int pixel_pairs = h_size >> 1;
    int pair, col;

    for (pair = 0; pair < row_pairs; pair++)
    {
        const uint8_t *src_top = image + 2*pair * vuy_stride;
        const uint8_t *src_bot = src_top + vuy_stride;
        uint8_t       *y_top   = py + 2*pair * y_stride;
        uint8_t       *y_bot   = y_top + y_stride;
        uint8_t       *u_row   = pu + pair * u_stride;
        uint8_t       *v_row   = pv + pair * v_stride;

        for (col = 0; col < pixel_pairs; col++)
        {
            const uint8_t *in_top = src_top + 4*col;
            const uint8_t *in_bot = src_bot + 4*col;

            u_row[col]       = (in_top[0] + in_bot[0]) >> 1;
            y_top[2*col]     = in_top[1];
            y_bot[2*col]     = in_bot[1];
            v_row[col]       = (in_top[2] + in_bot[2]) >> 1;
            y_top[2*col + 1] = in_top[3];
            y_bot[2*col + 1] = in_bot[3];
        }
    }
}
01075
01076 #if HAVE_ALTIVEC
01077
01078
01079
01080 #define VEC_READ_LINE(ptr, y, uv) \
01081 pa_vec = vec_ld(0, ptr); ptr += 16; \
01082 pb_vec = vec_ld(0, ptr); ptr += 16; \
01083 vec_st(vec_pack((vector unsigned short)pa_vec, \
01084 (vector unsigned short)pb_vec), \
01085 0, y); y += 16; \
01086 uv = vec_pack(vec_sr((vector unsigned short)pa_vec, eight_vec), \
01087 vec_sr((vector unsigned short)pb_vec, eight_vec));
01088
01089 #define VEC_SPLIT(a) \
01090 VEC_READ_LINE(pi1, py1, uv1_vec); \
01091 VEC_READ_LINE(pi2, py2, uv2_vec); \
01092 a = vec_avg(uv1_vec, uv2_vec);
01093
01094 #define VEC_STORE_UV() \
01095 vec_st(vec_pack((vector unsigned short)uva_vec, \
01096 (vector unsigned short)uvb_vec), \
01097 0, pv); pv += 16; \
01098 vec_st(vec_pack(vec_sr((vector unsigned short)uva_vec, eight_vec), \
01099 vec_sr((vector unsigned short)uvb_vec, eight_vec)), \
01100 0, pu); pu += 16;
01101
01102
01112 static void altivec_2vuy_i420(
01113 uint8_t *py, uint8_t *pu, uint8_t *pv,
01114 int y_stride, int u_stride, int v_stride,
01115 const uint8_t *image, int vuy_stride,
01116 int h_size, int v_size)
01117 {
01118 const uint8_t *pi1;
01119 const uint8_t *pi2 = image;
01120 uint8_t *py1, *py2 = py;
01121
01122 int x, y;
01123
01124 vector unsigned short eight_vec = vec_splat_u16(8);
01125 vector unsigned char pa_vec, pb_vec,
01126 uv1_vec, uv2_vec,
01127 uva_vec, uvb_vec;
01128
01129 int vuy_extra = vuy_stride - (h_size<<1);
01130 int y_extra = y_stride - (h_size);
01131 int u_extra = u_stride - (h_size>>1);
01132 int v_extra = v_stride - (h_size>>1);
01133
01134 if (vuy_extra || y_extra || u_extra || v_extra)
01135 {
01136
01137 non_vec_2vuy_i420(py, pu, pv,
01138 y_stride, u_stride, v_stride,
01139 image, vuy_stride,
01140 h_size, v_size);
01141 return;
01142 }
01143
01144 if (!((h_size % 32) || (v_size % 2)))
01145 {
01146
01147 for (y = v_size / 2; y--; )
01148 {
01149 VEC_NEXT_LINES();
01150 for (x = h_size / 32; x--; )
01151 {
01152 VEC_SPLIT(uva_vec);
01153 VEC_SPLIT(uvb_vec);
01154 VEC_STORE_UV();
01155 }
01156 }
01157 }
01158 else if (!((h_size % 16) || (v_size % 4)))
01159 {
01160
01161 for (y = v_size / 4; y--; )
01162 {
01163
01164 VEC_NEXT_LINES();
01165 for (x = h_size / 32; x--; )
01166 {
01167 VEC_SPLIT(uva_vec);
01168 VEC_SPLIT(uvb_vec);
01169 VEC_STORE_UV();
01170 }
01171
01172
01173 VEC_SPLIT(uva_vec);
01174
01175
01176 VEC_NEXT_LINES();
01177 VEC_SPLIT(uvb_vec);
01178 VEC_STORE_UV();
01179
01180
01181 for (x = h_size / 32; x--; )
01182 {
01183 VEC_SPLIT(uva_vec);
01184 VEC_SPLIT(uvb_vec);
01185 VEC_STORE_UV();
01186 }
01187 }
01188 }
01189 else
01190 {
01191
01192 non_vec_2vuy_i420(py, pu, pv,
01193 y_stride, u_stride, v_stride,
01194 image, vuy_stride,
01195 h_size, v_size);
01196 }
01197 }
01198
01199 #endif // HAVE_ALTIVEC
01200
01201
01215 conv_2vuy_i420_fun get_2vuy_i420_conv(void)
01216 {
01217 #if HAVE_ALTIVEC
01218 if (has_altivec())
01219 return altivec_2vuy_i420;
01220 #endif
01221 return non_vec_2vuy_i420;
01222 }