00001
00002
00003 #include <stdint.h>
00004 #include <QSize>
00005 #include "compat.h"
00006 #include "util-opengl.h"
00007
00008 #ifdef MMX
00009 extern "C" {
00010 #include "ffmpeg-mmx.h"
00011 }
00012
// 64-bit constant with every bit set.  The pack helpers below load it into
// mm4/mm7 and then unpack luma bytes against it.
// NOTE(review): presumably this supplies the 0xff byte of every output pixel
// (the scalar loops write 255 in that position) -- confirm.
00013 static mmx_t mmx_1s = {0xffffffffffffffffLL};
00014
// Prepares the "high" half of the luma data of two rows for packing.
// Loads the all-ones constant into mm4 and mm7, then applies punpckhbw with
// the bytes at y1 (into mm4) and at y2 (into mm7).  Nothing is written to
// memory; the results stay in mm4/mm7, which mmx_pack_end() reads.
// NOTE(review): the macros come from ffmpeg-mmx.h, which is not visible
// here; these comments assume its (source, destination) operand order.
00015 static inline void mmx_pack_alpha1s_high(uint8_t *y1, uint8_t *y2)
00016 {
// mm4: all-ones combined with the bytes at y1
00017 movq_m2r (mmx_1s, mm4);
00018 punpckhbw_m2r (*y1, mm4);
// mm7: all-ones combined with the bytes at y2
00019 movq_m2r (mmx_1s, mm7);
00020 punpckhbw_m2r (*y2, mm7);
00021 }
00022
// Prepares the "low" half of the luma data of two rows for packing.
// Loads the all-ones constant into mm4 and mm7, then applies punpcklbw with
// the bytes at y1 (into mm4) and at y2 (into mm7).  Nothing is written to
// memory; the results stay in mm4/mm7, which mmx_pack_middle() reads.
// NOTE(review): the macros come from ffmpeg-mmx.h, which is not visible
// here; these comments assume its (source, destination) operand order.
00023 static inline void mmx_pack_alpha1s_low(uint8_t *y1, uint8_t *y2)
00024 {
// mm4: all-ones combined with the bytes at y1
00025 movq_m2r (mmx_1s, mm4);
00026 punpcklbw_m2r (*y1, mm4);
// mm7: all-ones combined with the bytes at y2
00027 movq_m2r (mmx_1s, mm7);
00028 punpcklbw_m2r (*y2, mm7);
00029 }
00030
// Stores the first 16 output bytes of two rows: 8 bytes each at dest1,
// dest1 + 8, dest2 and dest2 + 8.
// All inputs are registers: mm2/mm3 as loaded by mmx_pack_chroma(), and
// mm4/mm7 as loaded by mmx_pack_alpha1s_low().  mm5 and mm6 are scratch.
// NOTE(review): presumably this yields four V,0xff,U,Y pixels per row, the
// layout the scalar loop in pack_yv12progressive() writes -- confirm against
// ffmpeg-mmx.h, whose (source, destination) operand order is assumed here.
00031 static inline void mmx_pack_middle(uint8_t *dest1, uint8_t *dest2)
00032 {
// mm5: low half of mm3 interleaved with mm2; shared by both rows
00033 movq_r2r (mm3, mm5);
00034 punpcklbw_r2r (mm2, mm5);
00035
// row 1, output bytes 0..7 (low half of mm5 combined with mm4)
00036 movq_r2r (mm5, mm6);
00037 punpcklbw_r2r (mm4, mm6);
00038 movq_r2m (mm6, *(dest1));
00039
// row 1, output bytes 8..15 (high half of mm5 combined with mm4)
00040 movq_r2r (mm5, mm6);
00041 punpckhbw_r2r (mm4, mm6);
00042 movq_r2m (mm6, *(dest1 + 8));
00043
// row 2, output bytes 0..7 (same chroma, luma taken from mm7)
00044 movq_r2r (mm5, mm6);
00045 punpcklbw_r2r (mm7, mm6);
00046 movq_r2m (mm6, *(dest2));
00047
// row 2, output bytes 8..15
00048 movq_r2r (mm5, mm6);
00049 punpckhbw_r2r (mm7, mm6);
00050 movq_r2m (mm6, *(dest2 + 8));
00051 }
00052
// Stores output bytes 16..31 of two rows: 8 bytes each at dest1 + 16,
// dest1 + 24, dest2 + 16 and dest2 + 24.
// All inputs are registers: mm2/mm3 as loaded by mmx_pack_chroma(), and
// mm4/mm7 as loaded by mmx_pack_alpha1s_high().  mm6 is scratch and mm3 is
// overwritten, so the chroma registers must be reloaded before the next use.
// NOTE(review): assumes the (source, destination) operand order of the
// ffmpeg-mmx.h macros, which are not visible here -- confirm.
00053 static inline void mmx_pack_end(uint8_t *dest1, uint8_t *dest2)
00054 {
// mm3: high half of mm3 interleaved with mm2 (modified in place)
00055 punpckhbw_r2r (mm2, mm3);
00056
// row 1, output bytes 16..23
00057 movq_r2r (mm3, mm6);
00058 punpcklbw_r2r (mm4, mm6);
00059 movq_r2m (mm6, *(dest1 + 16));
00060
// row 1, output bytes 24..31
00061 movq_r2r (mm3, mm6);
00062 punpckhbw_r2r (mm4, mm6);
00063 movq_r2m (mm6, *(dest1 + 24));
00064
// row 2, output bytes 16..23
00065 movq_r2r (mm3, mm6);
00066 punpcklbw_r2r (mm7, mm6);
00067 movq_r2m (mm6, *(dest2 + 16));
00068
// row 2, output bytes 24..31; mm3 is no longer needed, so no copy is made
00069 punpckhbw_r2r (mm7, mm3);
00070 movq_r2m (mm3, *(dest2 + 24));
00071 }
00072
// Single-row variant of the pack helpers: stores 32 output bytes at dest
// (four 8-byte stores at dest, dest + 8, dest + 16, dest + 24) using the luma
// bytes at y and the chroma already held in mm2/mm3.
// mm2/mm3 are set up by the caller (mmx_pack_chroma(), or
// mmx_interp_endu()/mmx_interp_endv()).  mm4, mm5 and mm6 are scratch and mm3
// is overwritten, so chroma must be reloaded before the next call.
// NOTE(review): assumes the (source, destination) operand order of the
// ffmpeg-mmx.h macros, which are not visible here -- confirm.
00073 static inline void mmx_pack_easy(uint8_t *dest, uint8_t *y)
00074 {
// mm4: all-ones combined (punpcklbw) with the luma bytes at y
00075 movq_m2r (mmx_1s, mm4);
00076 punpcklbw_m2r (*y, mm4);
00077
// mm5: low half of mm3 interleaved with mm2
00078 movq_r2r (mm3, mm5);
00079 punpcklbw_r2r (mm2, mm5);
00080
// output bytes 0..7
00081 movq_r2r (mm5, mm6);
00082 punpcklbw_r2r (mm4, mm6);
00083 movq_r2m (mm6, *(dest));
00084
// output bytes 8..15
00085 movq_r2r (mm5, mm6);
00086 punpckhbw_r2r (mm4, mm6);
00087 movq_r2m (mm6, *(dest + 8));
00088
// mm4: all-ones combined (punpckhbw) with the luma bytes at y
00089 movq_m2r (mmx_1s, mm4);
00090 punpckhbw_m2r (*y, mm4);
00091
// mm3: high half of mm3 interleaved with mm2 (modified in place)
00092 punpckhbw_r2r (mm2, mm3);
00093
// output bytes 16..23
00094 movq_r2r (mm3, mm6);
00095 punpcklbw_r2r (mm4, mm6);
00096 movq_r2m (mm6, *(dest + 16));
00097
// output bytes 24..31; mm3 is consumed here
00098 punpckhbw_r2r (mm4, mm3);
00099 movq_r2m (mm3, *(dest + 24));
00100 }
00101
// All-zero constant; mmx_interp_start() unpacks sample bytes against it.
// NOTE(review): presumably this zero-extends each byte to a 16-bit lane.
00102 static mmx_t mmx_0s = {0x0000000000000000LL};
// The value 2 in each of four 16-bit lanes; mmx_interp_start() adds it
// before shifting right by 2, mirroring the "+ 2, >> 2" rounding in
// c_interp().
00103 static mmx_t round = {0x0002000200020002LL};
00104
// Computes a 3:1 weighted, rounded blend of the samples at left and right and
// leaves it in mm4: per 16-bit lane, (3 * left + right + 2) >> 2.  This is
// the same formula c_interp() applies to single bytes.  mm5 is scratch.
// NOTE(review): assumes movd loads four bytes and that unpacking against
// mmx_0s widens them to 16-bit lanes; the ffmpeg-mmx.h macros are not
// visible here, and their (source, destination) operand order is assumed.
00105 static inline void mmx_interp_start(uint8_t *left, uint8_t *right)
00106 {
// mm5: samples at left, unpacked against zero
00107 movd_m2r (*left, mm5);
00108 punpcklbw_m2r (mmx_0s, mm5);
00109
// mm4 = 2 * left + left + rounding term
00110 movq_r2r (mm5, mm4);
00111 paddw_r2r (mm4, mm4);
00112 paddw_r2r (mm5, mm4);
00113 paddw_m2r (round, mm4);
00114
// mm4 += samples at right, unpacked the same way
00115 movd_m2r (*right, mm5);
00116 punpcklbw_m2r (mmx_0s, mm5);
00117 paddw_r2r (mm5, mm4);
00118
// divide by 4
00119 psrlw_i2r (2, mm4);
00120 }
00121
// Moves the blend left in mm4 by mmx_interp_start() into mm2, the register
// mmx_pack_chroma() loads from its u argument.
// Copies mm4 to mm2, shifts each 16-bit lane left by 8 and byte-adds mm4 back.
// NOTE(review): presumably each lane's low byte ends up in both bytes of the
// lane, i.e. the duplicated-byte layout mmx_pack_chroma() produces -- confirm.
00122 static inline void mmx_interp_endu(void)
00123 {
00124 movq_r2r (mm4, mm2);
00125 psllw_i2r (8, mm2);
00126 paddb_r2r (mm4, mm2);
00127 }
00128
// Moves the blend left in mm4 by mmx_interp_start() into mm3, the register
// mmx_pack_chroma() loads from its v argument.
// Copies mm4 to mm3, shifts each 16-bit lane left by 8 and byte-adds mm4 back.
// NOTE(review): presumably each lane's low byte ends up in both bytes of the
// lane, i.e. the duplicated-byte layout mmx_pack_chroma() produces -- confirm.
00129 static inline void mmx_interp_endv(void)
00130 {
00131 movq_r2r (mm4, mm3);
00132 psllw_i2r (8, mm3);
00133 paddb_r2r (mm4, mm3);
00134 }
00135
// Loads chroma for the pack helpers: the samples at u go into mm2 and the
// samples at v into mm3, then each register is unpacked (punpcklbw) against
// itself.  Nothing is written to memory.
// NOTE(review): presumably movd loads four bytes and the self-unpack
// duplicates each one, giving one chroma value per pair of horizontally
// adjacent pixels -- confirm against ffmpeg-mmx.h.
00136 static inline void mmx_pack_chroma(uint8_t *u, uint8_t *v)
00137 {
00138 movd_m2r (*u, mm2);
00139 movd_m2r (*v, mm3);
00140 punpcklbw_r2r (mm2, mm2);
00141 punpcklbw_r2r (mm3, mm3);
00142 }
00143 #endif // MMX
00144
/**
 * Blends two samples with a 3:1 weighting, rounding to nearest:
 * (3 * major + minor + 2) / 4.  The result always fits in a byte
 * (the maximum is (3 * 255 + 255 + 2) >> 2 == 255).
 */
static inline uint8_t c_interp_blend(uint8_t major, uint8_t minor)
{
    return (uint8_t) ((3u * major + minor + 2u) >> 2);
}

/**
 * Scalar chroma interpolation across four rows.
 *
 * The inputs are paired as (a, c) and (b, d); each output favours its own
 * sample 3:1 over its partner:
 *   dest[0] = blend(*a, *c)    dest[1] = blend(*b, *d)
 *   dest[2] = blend(*c, *a)    dest[3] = blend(*d, *b)
 *
 * \param dest receives the four interpolated bytes
 * \param a,b,c,d pointers to one input sample each (read only)
 */
static inline void c_interp(uint8_t *dest, const uint8_t *a, const uint8_t *b,
                            const uint8_t *c, const uint8_t *d)
{
    /* Outputs are produced strictly in order, each from fresh reads, exactly
     * as the open-coded version did. */
    dest[0] = c_interp_blend(*a, *c);
    dest[1] = c_interp_blend(*b, *d);
    dest[2] = c_interp_blend(*c, *a);
    dest[3] = c_interp_blend(*d, *b);
}
00172
00173 void pack_yv12progressive(const unsigned char *source,
00174 const unsigned char *dest,
00175 const int *offsets, const int *pitches,
00176 const QSize &size)
00177 {
00178 const int width = size.width();
00179 const int height = size.height();
00180
00181 if (height % 2 || width % 2)
00182 return;
00183
00184 #ifdef MMX
00185 int residual = width % 8;
00186 int mmx_width = width - residual;
00187 int c_start_w = mmx_width;
00188 #else
00189 int residual = 0;
00190 int c_start_w = 0;
00191 #endif
00192
00193 uint bgra_width = width << 2;
00194 uint chroma_width = width >> 1;
00195
00196 uint y_extra = (pitches[0] << 1) - width + residual;
00197 uint u_extra = pitches[1] - chroma_width + (residual >> 1);
00198 uint v_extra = pitches[2] - chroma_width + (residual >> 1);
00199 uint d_extra = bgra_width + (residual << 2);
00200
00201 uint8_t *ypt_1 = (uint8_t *)source + offsets[0];
00202 uint8_t *ypt_2 = ypt_1 + pitches[0];
00203 uint8_t *upt = (uint8_t *)source + offsets[1];
00204 uint8_t *vpt = (uint8_t *)source + offsets[2];
00205 uint8_t *dst_1 = (uint8_t *) dest;
00206 uint8_t *dst_2 = dst_1 + bgra_width;
00207
00208 #ifdef MMX
00209 for (int row = 0; row < height; row += 2)
00210 {
00211 for (int col = 0; col < mmx_width; col += 8)
00212 {
00213 mmx_pack_chroma(upt, vpt);
00214 mmx_pack_alpha1s_low(ypt_1, ypt_2);
00215 mmx_pack_middle(dst_1, dst_2);
00216 mmx_pack_alpha1s_high(ypt_1, ypt_2);
00217 mmx_pack_end(dst_1, dst_2);
00218
00219 dst_1 += 32; dst_2 += 32;
00220 ypt_1 += 8; ypt_2 += 8;
00221 upt += 4; vpt += 4;
00222
00223 }
00224 ypt_1 += y_extra; ypt_2 += y_extra;
00225 upt += u_extra; vpt += v_extra;
00226 dst_1 += d_extra; dst_2 += d_extra;
00227 }
00228
00229 emms();
00230
00231 if (residual)
00232 {
00233 y_extra = (pitches[0] << 1) - width + mmx_width;
00234 u_extra = pitches[1] - chroma_width + (mmx_width >> 1);
00235 v_extra = pitches[2] - chroma_width + (mmx_width >> 1);
00236 d_extra = bgra_width + (mmx_width << 2);
00237
00238 ypt_1 = (uint8_t *)source + offsets[0] + mmx_width;
00239 ypt_2 = ypt_1 + pitches[0];
00240 upt = (uint8_t *)source + offsets[1] + (mmx_width>>1);
00241 vpt = (uint8_t *)source + offsets[2] + (mmx_width>>1);
00242 dst_1 = (uint8_t *) dest + (mmx_width << 2);
00243 dst_2 = dst_1 + bgra_width;
00244 }
00245 else
00246 {
00247 return;
00248 }
00249 #endif //MMX
00250
00251 for (int row = 0; row < height; row += 2)
00252 {
00253 for (int col = c_start_w; col < width; col += 2)
00254 {
00255 *(dst_1++) = *vpt; *(dst_2++) = *vpt;
00256 *(dst_1++) = 255; *(dst_2++) = 255;
00257 *(dst_1++) = *upt; *(dst_2++) = *upt;
00258 *(dst_1++) = *(ypt_1++);
00259 *(dst_2++) = *(ypt_2++);
00260
00261 *(dst_1++) = *vpt; *(dst_2++) = *(vpt++);
00262 *(dst_1++) = 255; *(dst_2++) = 255;
00263 *(dst_1++) = *upt; *(dst_2++) = *(upt++);
00264 *(dst_1++) = *(ypt_1++);
00265 *(dst_2++) = *(ypt_2++);
00266 }
00267 ypt_1 += y_extra; ypt_2 += y_extra;
00268 upt += u_extra; vpt += v_extra;
00269 dst_1 += d_extra; dst_2 += d_extra;
00270 }
00271 }
00272
00273 void pack_yv12interlaced(const unsigned char *source,
00274 const unsigned char *dest,
00275 const int *offsets,
00276 const int *pitches,
00277 const QSize &size)
00278 {
00279 int width = size.width();
00280 int height = size.height();
00281
00282 if (height % 4 || width % 2)
00283 return;
00284
00285 uint bgra_width = width << 2;
00286 uint dwrap = (bgra_width << 2) - bgra_width;
00287 uint chroma_width = width >> 1;
00288 uint ywrap = (pitches[0] << 1) - width;
00289 uint uwrap = (pitches[1] << 1) - chroma_width;
00290 uint vwrap = (pitches[2] << 1) - chroma_width;
00291
00292 uint8_t *ypt_1 = (uint8_t *)source + offsets[0];
00293 uint8_t *ypt_2 = ypt_1 + pitches[0];
00294 uint8_t *ypt_3 = ypt_1 + (pitches[0] * (height - 2));
00295 uint8_t *ypt_4 = ypt_3 + pitches[0];
00296
00297 uint8_t *u1 = (uint8_t *)source + offsets[1];
00298 uint8_t *v1 = (uint8_t *)source + offsets[2];
00299 uint8_t *u2 = u1 + pitches[1]; uint8_t *v2 = v1 + pitches[2];
00300 uint8_t *u3 = u1 + (pitches[1] * ((height - 4) >> 1));
00301 uint8_t *v3 = v1 + (pitches[2] * ((height - 4) >> 1));
00302 uint8_t *u4 = u3 + pitches[1]; uint8_t *v4 = v3 + pitches[2];
00303
00304 uint8_t *dst_1 = (uint8_t *) dest;
00305 uint8_t *dst_2 = dst_1 + bgra_width;
00306 uint8_t *dst_3 = dst_1 + (bgra_width * (height - 2));
00307 uint8_t *dst_4 = dst_3 + bgra_width;
00308
00309 #ifdef MMX
00310
00311 if (!(width % 8))
00312 {
00313
00314 for (int col = 0; col < width; col += 8)
00315 {
00316 mmx_pack_chroma(u1, v1);
00317 mmx_pack_easy(dst_1, ypt_1);
00318 mmx_pack_chroma(u2, v2);
00319 mmx_pack_easy(dst_2, ypt_2);
00320 mmx_pack_chroma(u3, v3);
00321 mmx_pack_easy(dst_3, ypt_3);
00322 mmx_pack_chroma(u4, v4);
00323 mmx_pack_easy(dst_4, ypt_4);
00324
00325 dst_1 += 32; dst_2 += 32; dst_3 += 32; dst_4 += 32;
00326 ypt_1 += 8; ypt_2 += 8; ypt_3 += 8; ypt_4 += 8;
00327 u1 += 4; v1 += 4; u2 += 4; v2 += 4;
00328 u3 += 4; v3 += 4; u4 += 4; v4 += 4;
00329 }
00330
00331 ypt_1 += ywrap; ypt_2 += ywrap;
00332 dst_1 += bgra_width; dst_2 += bgra_width;
00333
00334 ypt_3 = ypt_2 + pitches[0];
00335 ypt_4 = ypt_3 + pitches[0];
00336 dst_3 = dst_2 + bgra_width;
00337 dst_4 = dst_3 + bgra_width;
00338
00339 ywrap = (pitches[0] << 2) - width;
00340
00341 u1 = (uint8_t *)source + offsets[1];
00342 v1 = (uint8_t *)source + offsets[2];
00343 u2 = u1 + pitches[1]; v2 = v1 + pitches[2];
00344 u3 = u2 + pitches[1]; v3 = v2 + pitches[2];
00345 u4 = u3 + pitches[1]; v4 = v3 + pitches[2];
00346
00347 height -= 4;
00348
00349
00350 for (int row = 0 ; row < height; row += 4)
00351 {
00352 for (int col = 0; col < width; col += 8)
00353 {
00354 mmx_interp_start(u1, u3); mmx_interp_endu();
00355 mmx_interp_start(v1, v3); mmx_interp_endv();
00356 mmx_pack_easy(dst_1, ypt_1);
00357
00358 mmx_interp_start(u2, u4); mmx_interp_endu();
00359 mmx_interp_start(v2, v4); mmx_interp_endv();
00360 mmx_pack_easy(dst_2, ypt_2);
00361
00362 mmx_interp_start(u3, u1); mmx_interp_endu();
00363 mmx_interp_start(v3, v1); mmx_interp_endv();
00364 mmx_pack_easy(dst_3, ypt_3);
00365
00366 mmx_interp_start(u4, u2); mmx_interp_endu();
00367 mmx_interp_start(v4, v2); mmx_interp_endv();
00368 mmx_pack_easy(dst_4, ypt_4);
00369
00370 dst_1 += 32; dst_2 += 32; dst_3 += 32; dst_4 += 32;
00371 ypt_1 += 8; ypt_2 += 8; ypt_3 += 8; ypt_4 += 8;
00372 u1 += 4; u2 += 4; u3 += 4; u4 += 4;
00373 v1 += 4; v2 += 4; v3 += 4; v4 += 4;
00374 }
00375
00376 ypt_1 += ywrap; ypt_2 += ywrap; ypt_3 += ywrap; ypt_4 += ywrap;
00377 dst_1 += dwrap; dst_2 += dwrap; dst_3 += dwrap; dst_4 += dwrap;
00378 u1 += uwrap; v1 += vwrap; u2 += uwrap; v2 += vwrap;
00379 u3 += uwrap; v3 += vwrap; u4 += uwrap;v4 += vwrap;
00380 }
00381
00382 emms();
00383
00384 return;
00385 }
00386 #endif //MMX
00387
00388
00389 for (int col = 0; col < width; col += 2)
00390 {
00391 *(dst_1++) = *v1; *(dst_2++) = *v2; *(dst_3++) = *v3; *(dst_4++) = *v4;
00392 *(dst_1++) = 255; *(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255;
00393 *(dst_1++) = *u1; *(dst_2++) = *u2; *(dst_3++) = *u3; *(dst_4++) = *u4;
00394 *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++);
00395 *(dst_3++) = *(ypt_3++); *(dst_4++) = *(ypt_4++);
00396
00397 *(dst_1++) = *(v1++); *(dst_2++) = *(v2++);
00398 *(dst_3++) = *(v3++); *(dst_4++) = *(v4++);
00399 *(dst_1++) = 255; *(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255;
00400 *(dst_1++) = *(u1++); *(dst_2++) = *(u2++);
00401 *(dst_3++) = *(u3++); *(dst_4++) = *(u4++);
00402 *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++);
00403 *(dst_3++) = *(ypt_3++); *(dst_4++) = *(ypt_4++);
00404 }
00405
00406 ypt_1 += ywrap; ypt_2 += ywrap;
00407 dst_1 += bgra_width; dst_2 += bgra_width;
00408
00409 ypt_3 = ypt_2 + pitches[0];
00410 ypt_4 = ypt_3 + pitches[0];
00411 dst_3 = dst_2 + bgra_width;
00412 dst_4 = dst_3 + bgra_width;
00413
00414 ywrap = (pitches[0] << 2) - width;
00415
00416 u1 = (uint8_t *)source + offsets[1];
00417 v1 = (uint8_t *)source + offsets[2];
00418 u2 = u1 + pitches[1]; v2 = v1 + pitches[2];
00419 u3 = u2 + pitches[1]; v3 = v2 + pitches[2];
00420 u4 = u3 + pitches[1]; v4 = v3 + pitches[2];
00421
00422 height -= 4;
00423
00424 uint8_t v[4], u[4];
00425
00426
00427 for (int row = 0; row < height; row += 4)
00428 {
00429 for (int col = 0; col < width; col += 2)
00430 {
00431 c_interp(v, v1, v2, v3, v4);
00432 c_interp(u, u1, u2, u3, u4);
00433
00434 *(dst_1++) = v[0]; *(dst_2++) = v[1];
00435 *(dst_3++) = v[2]; *(dst_4++) = v[3];
00436 *(dst_1++) = 255; *(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255;
00437 *(dst_1++) = u[0]; *(dst_2++) = u[1];
00438 *(dst_3++) = u[2]; *(dst_4++) = u[3];
00439 *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++);
00440 *(dst_3++) = *(ypt_3++); *(dst_4++) = *(ypt_4++);
00441
00442 *(dst_1++) = v[0]; *(dst_2++) = v[1];
00443 *(dst_3++) = v[2]; *(dst_4++) = v[3];
00444 *(dst_1++) = 255; *(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255;
00445 *(dst_1++) = u[0]; *(dst_2++) = u[1];
00446 *(dst_3++) = u[2]; *(dst_4++) = u[3];
00447 *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++);
00448 *(dst_3++) = *(ypt_3++); *(dst_4++) = *(ypt_4++);
00449
00450 v1++; v2++; v3++; v4++;
00451 u1++; u2++; u3++; u4++;
00452 }
00453 ypt_1 += ywrap; ypt_2 += ywrap; ypt_3 += ywrap; ypt_4 += ywrap;
00454 u1 += uwrap; u2 += uwrap; u3 += uwrap; u4 += uwrap;
00455 v1 += vwrap; v2 += vwrap; v3 += vwrap; v4 += vwrap;
00456 dst_1 += dwrap; dst_2 += dwrap; dst_3 += dwrap; dst_4 += dwrap;
00457 }
00458 }