00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "dsputil.h"
00026 #include "mpegvideo.h"
00027 #include "avcodec.h"
00028 #include "x86_cpu.h"
00029
/* 64-entry table defined in another translation unit.  It is not referenced
 * by the functions visible in this file; presumably it is used by the
 * included mpegvideo_mmx_template.c -- confirm. */
00030 extern uint16_t inv_zigzag_direct16[64];
00031
/* 8-byte aligned 64-bit constants: mm_wabs has every bit set, mm_wone holds
 * the value 1 in each of its four 16-bit words.  Neither is referenced by the
 * functions visible in this file (NOTE(review): possibly used by the included
 * template, or left over -- confirm). */
00032 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
00033 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
00034
00035
00036 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
00037 DCTELEM *block, int n, int qscale)
00038 {
00039 long level, qmul, qadd, nCoeffs;
00040
00041 qmul = qscale << 1;
00042
00043 assert(s->block_last_index[n]>=0 || s->h263_aic);
00044
00045 if (!s->h263_aic) {
00046 if (n < 4)
00047 level = block[0] * s->y_dc_scale;
00048 else
00049 level = block[0] * s->c_dc_scale;
00050 qadd = (qscale - 1) | 1;
00051 }else{
00052 qadd = 0;
00053 level= block[0];
00054 }
00055 if(s->ac_pred)
00056 nCoeffs=63;
00057 else
00058 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00059
00060 asm volatile(
00061 "movd %1, %%mm6 \n\t"
00062 "packssdw %%mm6, %%mm6 \n\t"
00063 "packssdw %%mm6, %%mm6 \n\t"
00064 "movd %2, %%mm5 \n\t"
00065 "pxor %%mm7, %%mm7 \n\t"
00066 "packssdw %%mm5, %%mm5 \n\t"
00067 "packssdw %%mm5, %%mm5 \n\t"
00068 "psubw %%mm5, %%mm7 \n\t"
00069 "pxor %%mm4, %%mm4 \n\t"
00070 ASMALIGN(4)
00071 "1: \n\t"
00072 "movq (%0, %3), %%mm0 \n\t"
00073 "movq 8(%0, %3), %%mm1 \n\t"
00074
00075 "pmullw %%mm6, %%mm0 \n\t"
00076 "pmullw %%mm6, %%mm1 \n\t"
00077
00078 "movq (%0, %3), %%mm2 \n\t"
00079 "movq 8(%0, %3), %%mm3 \n\t"
00080
00081 "pcmpgtw %%mm4, %%mm2 \n\t"
00082 "pcmpgtw %%mm4, %%mm3 \n\t"
00083
00084 "pxor %%mm2, %%mm0 \n\t"
00085 "pxor %%mm3, %%mm1 \n\t"
00086
00087 "paddw %%mm7, %%mm0 \n\t"
00088 "paddw %%mm7, %%mm1 \n\t"
00089
00090 "pxor %%mm0, %%mm2 \n\t"
00091 "pxor %%mm1, %%mm3 \n\t"
00092
00093 "pcmpeqw %%mm7, %%mm0 \n\t"
00094 "pcmpeqw %%mm7, %%mm1 \n\t"
00095
00096 "pandn %%mm2, %%mm0 \n\t"
00097 "pandn %%mm3, %%mm1 \n\t"
00098
00099 "movq %%mm0, (%0, %3) \n\t"
00100 "movq %%mm1, 8(%0, %3) \n\t"
00101
00102 "add $16, %3 \n\t"
00103 "jng 1b \n\t"
00104 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
00105 : "memory"
00106 );
00107 block[0]= level;
00108 }
00109
00110
00111 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
00112 DCTELEM *block, int n, int qscale)
00113 {
00114 long qmul, qadd, nCoeffs;
00115
00116 qmul = qscale << 1;
00117 qadd = (qscale - 1) | 1;
00118
00119 assert(s->block_last_index[n]>=0 || s->h263_aic);
00120
00121 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00122
00123 asm volatile(
00124 "movd %1, %%mm6 \n\t"
00125 "packssdw %%mm6, %%mm6 \n\t"
00126 "packssdw %%mm6, %%mm6 \n\t"
00127 "movd %2, %%mm5 \n\t"
00128 "pxor %%mm7, %%mm7 \n\t"
00129 "packssdw %%mm5, %%mm5 \n\t"
00130 "packssdw %%mm5, %%mm5 \n\t"
00131 "psubw %%mm5, %%mm7 \n\t"
00132 "pxor %%mm4, %%mm4 \n\t"
00133 ASMALIGN(4)
00134 "1: \n\t"
00135 "movq (%0, %3), %%mm0 \n\t"
00136 "movq 8(%0, %3), %%mm1 \n\t"
00137
00138 "pmullw %%mm6, %%mm0 \n\t"
00139 "pmullw %%mm6, %%mm1 \n\t"
00140
00141 "movq (%0, %3), %%mm2 \n\t"
00142 "movq 8(%0, %3), %%mm3 \n\t"
00143
00144 "pcmpgtw %%mm4, %%mm2 \n\t"
00145 "pcmpgtw %%mm4, %%mm3 \n\t"
00146
00147 "pxor %%mm2, %%mm0 \n\t"
00148 "pxor %%mm3, %%mm1 \n\t"
00149
00150 "paddw %%mm7, %%mm0 \n\t"
00151 "paddw %%mm7, %%mm1 \n\t"
00152
00153 "pxor %%mm0, %%mm2 \n\t"
00154 "pxor %%mm1, %%mm3 \n\t"
00155
00156 "pcmpeqw %%mm7, %%mm0 \n\t"
00157 "pcmpeqw %%mm7, %%mm1 \n\t"
00158
00159 "pandn %%mm2, %%mm0 \n\t"
00160 "pandn %%mm3, %%mm1 \n\t"
00161
00162 "movq %%mm0, (%0, %3) \n\t"
00163 "movq %%mm1, 8(%0, %3) \n\t"
00164
00165 "add $16, %3 \n\t"
00166 "jng 1b \n\t"
00167 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
00168 : "memory"
00169 );
00170 }
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
00203 DCTELEM *block, int n, int qscale)
00204 {
00205 long nCoeffs;
00206 const uint16_t *quant_matrix;
00207 int block0;
00208
00209 assert(s->block_last_index[n]>=0);
00210
00211 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00212
00213 if (n < 4)
00214 block0 = block[0] * s->y_dc_scale;
00215 else
00216 block0 = block[0] * s->c_dc_scale;
00217
00218 quant_matrix = s->intra_matrix;
00219 asm volatile(
00220 "pcmpeqw %%mm7, %%mm7 \n\t"
00221 "psrlw $15, %%mm7 \n\t"
00222 "movd %2, %%mm6 \n\t"
00223 "packssdw %%mm6, %%mm6 \n\t"
00224 "packssdw %%mm6, %%mm6 \n\t"
00225 "mov %3, %%"REG_a" \n\t"
00226 ASMALIGN(4)
00227 "1: \n\t"
00228 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00229 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00230 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00231 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00232 "pmullw %%mm6, %%mm4 \n\t"
00233 "pmullw %%mm6, %%mm5 \n\t"
00234 "pxor %%mm2, %%mm2 \n\t"
00235 "pxor %%mm3, %%mm3 \n\t"
00236 "pcmpgtw %%mm0, %%mm2 \n\t"
00237 "pcmpgtw %%mm1, %%mm3 \n\t"
00238 "pxor %%mm2, %%mm0 \n\t"
00239 "pxor %%mm3, %%mm1 \n\t"
00240 "psubw %%mm2, %%mm0 \n\t"
00241 "psubw %%mm3, %%mm1 \n\t"
00242 "pmullw %%mm4, %%mm0 \n\t"
00243 "pmullw %%mm5, %%mm1 \n\t"
00244 "pxor %%mm4, %%mm4 \n\t"
00245 "pxor %%mm5, %%mm5 \n\t"
00246 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00247 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00248 "psraw $3, %%mm0 \n\t"
00249 "psraw $3, %%mm1 \n\t"
00250 "psubw %%mm7, %%mm0 \n\t"
00251 "psubw %%mm7, %%mm1 \n\t"
00252 "por %%mm7, %%mm0 \n\t"
00253 "por %%mm7, %%mm1 \n\t"
00254 "pxor %%mm2, %%mm0 \n\t"
00255 "pxor %%mm3, %%mm1 \n\t"
00256 "psubw %%mm2, %%mm0 \n\t"
00257 "psubw %%mm3, %%mm1 \n\t"
00258 "pandn %%mm0, %%mm4 \n\t"
00259 "pandn %%mm1, %%mm5 \n\t"
00260 "movq %%mm4, (%0, %%"REG_a") \n\t"
00261 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00262
00263 "add $16, %%"REG_a" \n\t"
00264 "js 1b \n\t"
00265 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
00266 : "%"REG_a, "memory"
00267 );
00268 block[0]= block0;
00269 }
00270
00271 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
00272 DCTELEM *block, int n, int qscale)
00273 {
00274 long nCoeffs;
00275 const uint16_t *quant_matrix;
00276
00277 assert(s->block_last_index[n]>=0);
00278
00279 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00280
00281 quant_matrix = s->inter_matrix;
00282 asm volatile(
00283 "pcmpeqw %%mm7, %%mm7 \n\t"
00284 "psrlw $15, %%mm7 \n\t"
00285 "movd %2, %%mm6 \n\t"
00286 "packssdw %%mm6, %%mm6 \n\t"
00287 "packssdw %%mm6, %%mm6 \n\t"
00288 "mov %3, %%"REG_a" \n\t"
00289 ASMALIGN(4)
00290 "1: \n\t"
00291 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00292 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00293 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00294 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00295 "pmullw %%mm6, %%mm4 \n\t"
00296 "pmullw %%mm6, %%mm5 \n\t"
00297 "pxor %%mm2, %%mm2 \n\t"
00298 "pxor %%mm3, %%mm3 \n\t"
00299 "pcmpgtw %%mm0, %%mm2 \n\t"
00300 "pcmpgtw %%mm1, %%mm3 \n\t"
00301 "pxor %%mm2, %%mm0 \n\t"
00302 "pxor %%mm3, %%mm1 \n\t"
00303 "psubw %%mm2, %%mm0 \n\t"
00304 "psubw %%mm3, %%mm1 \n\t"
00305 "paddw %%mm0, %%mm0 \n\t"
00306 "paddw %%mm1, %%mm1 \n\t"
00307 "paddw %%mm7, %%mm0 \n\t"
00308 "paddw %%mm7, %%mm1 \n\t"
00309 "pmullw %%mm4, %%mm0 \n\t"
00310 "pmullw %%mm5, %%mm1 \n\t"
00311 "pxor %%mm4, %%mm4 \n\t"
00312 "pxor %%mm5, %%mm5 \n\t"
00313 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00314 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00315 "psraw $4, %%mm0 \n\t"
00316 "psraw $4, %%mm1 \n\t"
00317 "psubw %%mm7, %%mm0 \n\t"
00318 "psubw %%mm7, %%mm1 \n\t"
00319 "por %%mm7, %%mm0 \n\t"
00320 "por %%mm7, %%mm1 \n\t"
00321 "pxor %%mm2, %%mm0 \n\t"
00322 "pxor %%mm3, %%mm1 \n\t"
00323 "psubw %%mm2, %%mm0 \n\t"
00324 "psubw %%mm3, %%mm1 \n\t"
00325 "pandn %%mm0, %%mm4 \n\t"
00326 "pandn %%mm1, %%mm5 \n\t"
00327 "movq %%mm4, (%0, %%"REG_a") \n\t"
00328 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00329
00330 "add $16, %%"REG_a" \n\t"
00331 "js 1b \n\t"
00332 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
00333 : "%"REG_a, "memory"
00334 );
00335 }
00336
00337 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
00338 DCTELEM *block, int n, int qscale)
00339 {
00340 long nCoeffs;
00341 const uint16_t *quant_matrix;
00342 int block0;
00343
00344 assert(s->block_last_index[n]>=0);
00345
00346 if(s->alternate_scan) nCoeffs= 63;
00347 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00348
00349 if (n < 4)
00350 block0 = block[0] * s->y_dc_scale;
00351 else
00352 block0 = block[0] * s->c_dc_scale;
00353 quant_matrix = s->intra_matrix;
00354 asm volatile(
00355 "pcmpeqw %%mm7, %%mm7 \n\t"
00356 "psrlw $15, %%mm7 \n\t"
00357 "movd %2, %%mm6 \n\t"
00358 "packssdw %%mm6, %%mm6 \n\t"
00359 "packssdw %%mm6, %%mm6 \n\t"
00360 "mov %3, %%"REG_a" \n\t"
00361 ASMALIGN(4)
00362 "1: \n\t"
00363 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00364 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00365 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00366 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00367 "pmullw %%mm6, %%mm4 \n\t"
00368 "pmullw %%mm6, %%mm5 \n\t"
00369 "pxor %%mm2, %%mm2 \n\t"
00370 "pxor %%mm3, %%mm3 \n\t"
00371 "pcmpgtw %%mm0, %%mm2 \n\t"
00372 "pcmpgtw %%mm1, %%mm3 \n\t"
00373 "pxor %%mm2, %%mm0 \n\t"
00374 "pxor %%mm3, %%mm1 \n\t"
00375 "psubw %%mm2, %%mm0 \n\t"
00376 "psubw %%mm3, %%mm1 \n\t"
00377 "pmullw %%mm4, %%mm0 \n\t"
00378 "pmullw %%mm5, %%mm1 \n\t"
00379 "pxor %%mm4, %%mm4 \n\t"
00380 "pxor %%mm5, %%mm5 \n\t"
00381 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00382 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00383 "psraw $3, %%mm0 \n\t"
00384 "psraw $3, %%mm1 \n\t"
00385 "pxor %%mm2, %%mm0 \n\t"
00386 "pxor %%mm3, %%mm1 \n\t"
00387 "psubw %%mm2, %%mm0 \n\t"
00388 "psubw %%mm3, %%mm1 \n\t"
00389 "pandn %%mm0, %%mm4 \n\t"
00390 "pandn %%mm1, %%mm5 \n\t"
00391 "movq %%mm4, (%0, %%"REG_a") \n\t"
00392 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00393
00394 "add $16, %%"REG_a" \n\t"
00395 "jng 1b \n\t"
00396 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
00397 : "%"REG_a, "memory"
00398 );
00399 block[0]= block0;
00400
00401 }
00402
00403 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
00404 DCTELEM *block, int n, int qscale)
00405 {
00406 long nCoeffs;
00407 const uint16_t *quant_matrix;
00408
00409 assert(s->block_last_index[n]>=0);
00410
00411 if(s->alternate_scan) nCoeffs= 63;
00412 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00413
00414 quant_matrix = s->inter_matrix;
00415 asm volatile(
00416 "pcmpeqw %%mm7, %%mm7 \n\t"
00417 "psrlq $48, %%mm7 \n\t"
00418 "movd %2, %%mm6 \n\t"
00419 "packssdw %%mm6, %%mm6 \n\t"
00420 "packssdw %%mm6, %%mm6 \n\t"
00421 "mov %3, %%"REG_a" \n\t"
00422 ASMALIGN(4)
00423 "1: \n\t"
00424 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00425 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00426 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00427 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00428 "pmullw %%mm6, %%mm4 \n\t"
00429 "pmullw %%mm6, %%mm5 \n\t"
00430 "pxor %%mm2, %%mm2 \n\t"
00431 "pxor %%mm3, %%mm3 \n\t"
00432 "pcmpgtw %%mm0, %%mm2 \n\t"
00433 "pcmpgtw %%mm1, %%mm3 \n\t"
00434 "pxor %%mm2, %%mm0 \n\t"
00435 "pxor %%mm3, %%mm1 \n\t"
00436 "psubw %%mm2, %%mm0 \n\t"
00437 "psubw %%mm3, %%mm1 \n\t"
00438 "paddw %%mm0, %%mm0 \n\t"
00439 "paddw %%mm1, %%mm1 \n\t"
00440 "pmullw %%mm4, %%mm0 \n\t"
00441 "pmullw %%mm5, %%mm1 \n\t"
00442 "paddw %%mm4, %%mm0 \n\t"
00443 "paddw %%mm5, %%mm1 \n\t"
00444 "pxor %%mm4, %%mm4 \n\t"
00445 "pxor %%mm5, %%mm5 \n\t"
00446 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00447 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00448 "psrlw $4, %%mm0 \n\t"
00449 "psrlw $4, %%mm1 \n\t"
00450 "pxor %%mm2, %%mm0 \n\t"
00451 "pxor %%mm3, %%mm1 \n\t"
00452 "psubw %%mm2, %%mm0 \n\t"
00453 "psubw %%mm3, %%mm1 \n\t"
00454 "pandn %%mm0, %%mm4 \n\t"
00455 "pandn %%mm1, %%mm5 \n\t"
00456 "pxor %%mm4, %%mm7 \n\t"
00457 "pxor %%mm5, %%mm7 \n\t"
00458 "movq %%mm4, (%0, %%"REG_a") \n\t"
00459 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00460
00461 "add $16, %%"REG_a" \n\t"
00462 "jng 1b \n\t"
00463 "movd 124(%0, %3), %%mm0 \n\t"
00464 "movq %%mm7, %%mm6 \n\t"
00465 "psrlq $32, %%mm7 \n\t"
00466 "pxor %%mm6, %%mm7 \n\t"
00467 "movq %%mm7, %%mm6 \n\t"
00468 "psrlq $16, %%mm7 \n\t"
00469 "pxor %%mm6, %%mm7 \n\t"
00470 "pslld $31, %%mm7 \n\t"
00471 "psrlq $15, %%mm7 \n\t"
00472 "pxor %%mm7, %%mm0 \n\t"
00473 "movd %%mm0, 124(%0, %3) \n\t"
00474
00475 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
00476 : "%"REG_a, "memory"
00477 );
00478 }
00479
00480
00481
/**
 * Extend a picture plane by replicating its border pixels (MMX version).
 *
 * First, on every picture row the leftmost pixel is copied into the w bytes
 * to its left and the rightmost pixel into the w bytes to its right.
 * Then the (already widened) first row is copied into the w rows above the
 * picture and the last row into the w rows below it.
 *
 * @param buf    address of the top-left picture pixel
 * @param wrap   distance in bytes between the starts of consecutive rows
 * @param width  picture width in pixels; the code reads 8 bytes ending at
 *               the last pixel of each row, so it must be at least 8
 * @param height picture height in rows
 * @param w      border width; w == 8 writes 8 bytes per side, any other
 *               value takes the branch that writes 16 bytes per side.  The
 *               vertical pass fills rows in groups of four (i += 4).
 *
 * The caller must provide writable memory for the border around the picture.
 * The MMX registers mm0/mm1 are used without being listed as clobbers and no
 * emms is executed here.
 */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;

    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile(
            "1:                             \n\t"
            "movd      (%0), %%mm0          \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t" /* first pixel x 8 */
            "movq      %%mm0, -8(%0)        \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t" /* last pixel x 8 */
            "movq      %%mm1, (%0, %2)      \n\t"
            "add       %1, %0               \n\t" /* next row */
            "cmp       %3, %0               \n\t"
            " jb       1b                   \n\t"
            : "+r" (ptr)
            : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap * height)
            : "memory"
        );
    } else {
        __asm__ volatile(
            "1:                             \n\t"
            "movd      (%0), %%mm0          \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t" /* first pixel x 8 */
            "movq      %%mm0, -8(%0)        \n\t"
            "movq      %%mm0, -16(%0)       \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t" /* last pixel x 8 */
            "movq      %%mm1, (%0, %2)      \n\t"
            "movq      %%mm1, 8(%0, %2)     \n\t"
            "add       %1, %0               \n\t" /* next row */
            "cmp       %3, %0               \n\t"
            " jb       1b                   \n\t"
            : "+r" (ptr)
            : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap * height)
            : "memory"
        );
    }

    for (i = 0; i < w; i += 4) {
        /* top: copy row 0 into rows -(i+1) .. -(i+4), 8 bytes at a time */
        ptr = buf - (i + 1) * wrap - w;
        __asm__ volatile(
            "1:                             \n\t"
            "movq      (%1, %0), %%mm0      \n\t" /* same column of row 0 */
            "movq      %%mm0, (%0)          \n\t"
            "movq      %%mm0, (%0, %2)      \n\t"
            "movq      %%mm0, (%0, %2, 2)   \n\t"
            "movq      %%mm0, (%0, %3)      \n\t"
            "add       $8, %0               \n\t"
            "cmp       %4, %0               \n\t"
            " jb       1b                   \n\t"
            : "+r" (ptr)
            : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap),
              "r" ((long)-wrap * 3), "r" (ptr + width + 2 * w)
            : "memory"
        );

        /* bottom: copy the last row into the rows (i+1) .. (i+4) below it */
        ptr = last_line + (i + 1) * wrap - w;
        __asm__ volatile(
            "1:                             \n\t"
            "movq      (%1, %0), %%mm0      \n\t" /* same column of last row */
            "movq      %%mm0, (%0)          \n\t"
            "movq      %%mm0, (%0, %2)      \n\t"
            "movq      %%mm0, (%0, %2, 2)   \n\t"
            "movq      %%mm0, (%0, %3)      \n\t"
            "add       $8, %0               \n\t"
            "cmp       %4, %0               \n\t"
            " jb       1b                   \n\t"
            : "+r" (ptr)
            : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap),
              "r" ((long)wrap * 3), "r" (ptr + width + 2 * w)
            : "memory"
        );
    }
}
00567
00568 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
00569 const int intra= s->mb_intra;
00570 int *sum= s->dct_error_sum[intra];
00571 uint16_t *offset= s->dct_offset[intra];
00572
00573 s->dct_count[intra]++;
00574
00575 asm volatile(
00576 "pxor %%mm7, %%mm7 \n\t"
00577 "1: \n\t"
00578 "pxor %%mm0, %%mm0 \n\t"
00579 "pxor %%mm1, %%mm1 \n\t"
00580 "movq (%0), %%mm2 \n\t"
00581 "movq 8(%0), %%mm3 \n\t"
00582 "pcmpgtw %%mm2, %%mm0 \n\t"
00583 "pcmpgtw %%mm3, %%mm1 \n\t"
00584 "pxor %%mm0, %%mm2 \n\t"
00585 "pxor %%mm1, %%mm3 \n\t"
00586 "psubw %%mm0, %%mm2 \n\t"
00587 "psubw %%mm1, %%mm3 \n\t"
00588 "movq %%mm2, %%mm4 \n\t"
00589 "movq %%mm3, %%mm5 \n\t"
00590 "psubusw (%2), %%mm2 \n\t"
00591 "psubusw 8(%2), %%mm3 \n\t"
00592 "pxor %%mm0, %%mm2 \n\t"
00593 "pxor %%mm1, %%mm3 \n\t"
00594 "psubw %%mm0, %%mm2 \n\t"
00595 "psubw %%mm1, %%mm3 \n\t"
00596 "movq %%mm2, (%0) \n\t"
00597 "movq %%mm3, 8(%0) \n\t"
00598 "movq %%mm4, %%mm2 \n\t"
00599 "movq %%mm5, %%mm3 \n\t"
00600 "punpcklwd %%mm7, %%mm4 \n\t"
00601 "punpckhwd %%mm7, %%mm2 \n\t"
00602 "punpcklwd %%mm7, %%mm5 \n\t"
00603 "punpckhwd %%mm7, %%mm3 \n\t"
00604 "paddd (%1), %%mm4 \n\t"
00605 "paddd 8(%1), %%mm2 \n\t"
00606 "paddd 16(%1), %%mm5 \n\t"
00607 "paddd 24(%1), %%mm3 \n\t"
00608 "movq %%mm4, (%1) \n\t"
00609 "movq %%mm2, 8(%1) \n\t"
00610 "movq %%mm5, 16(%1) \n\t"
00611 "movq %%mm3, 24(%1) \n\t"
00612 "add $16, %0 \n\t"
00613 "add $32, %1 \n\t"
00614 "add $16, %2 \n\t"
00615 "cmp %3, %0 \n\t"
00616 " jb 1b \n\t"
00617 : "+r" (block), "+r" (sum), "+r" (offset)
00618 : "r"(block+64)
00619 );
00620 }
00621
00622 static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
00623 const int intra= s->mb_intra;
00624 int *sum= s->dct_error_sum[intra];
00625 uint16_t *offset= s->dct_offset[intra];
00626
00627 s->dct_count[intra]++;
00628
00629 asm volatile(
00630 "pxor %%xmm7, %%xmm7 \n\t"
00631 "1: \n\t"
00632 "pxor %%xmm0, %%xmm0 \n\t"
00633 "pxor %%xmm1, %%xmm1 \n\t"
00634 "movdqa (%0), %%xmm2 \n\t"
00635 "movdqa 16(%0), %%xmm3 \n\t"
00636 "pcmpgtw %%xmm2, %%xmm0 \n\t"
00637 "pcmpgtw %%xmm3, %%xmm1 \n\t"
00638 "pxor %%xmm0, %%xmm2 \n\t"
00639 "pxor %%xmm1, %%xmm3 \n\t"
00640 "psubw %%xmm0, %%xmm2 \n\t"
00641 "psubw %%xmm1, %%xmm3 \n\t"
00642 "movdqa %%xmm2, %%xmm4 \n\t"
00643 "movdqa %%xmm3, %%xmm5 \n\t"
00644 "psubusw (%2), %%xmm2 \n\t"
00645 "psubusw 16(%2), %%xmm3 \n\t"
00646 "pxor %%xmm0, %%xmm2 \n\t"
00647 "pxor %%xmm1, %%xmm3 \n\t"
00648 "psubw %%xmm0, %%xmm2 \n\t"
00649 "psubw %%xmm1, %%xmm3 \n\t"
00650 "movdqa %%xmm2, (%0) \n\t"
00651 "movdqa %%xmm3, 16(%0) \n\t"
00652 "movdqa %%xmm4, %%xmm6 \n\t"
00653 "movdqa %%xmm5, %%xmm0 \n\t"
00654 "punpcklwd %%xmm7, %%xmm4 \n\t"
00655 "punpckhwd %%xmm7, %%xmm6 \n\t"
00656 "punpcklwd %%xmm7, %%xmm5 \n\t"
00657 "punpckhwd %%xmm7, %%xmm0 \n\t"
00658 "paddd (%1), %%xmm4 \n\t"
00659 "paddd 16(%1), %%xmm6 \n\t"
00660 "paddd 32(%1), %%xmm5 \n\t"
00661 "paddd 48(%1), %%xmm0 \n\t"
00662 "movdqa %%xmm4, (%1) \n\t"
00663 "movdqa %%xmm6, 16(%1) \n\t"
00664 "movdqa %%xmm5, 32(%1) \n\t"
00665 "movdqa %%xmm0, 48(%1) \n\t"
00666 "add $32, %0 \n\t"
00667 "add $64, %1 \n\t"
00668 "add $32, %2 \n\t"
00669 "cmp %3, %0 \n\t"
00670 " jb 1b \n\t"
00671 : "+r" (block), "+r" (sum), "+r" (offset)
00672 : "r"(block+64)
00673 );
00674 }
00675
/*
 * Include mpegvideo_mmx_template.c once per instruction-set variant.
 * Before each inclusion the HAVE_* feature macros are adjusted and
 * RENAME()/RENAMEl() are redefined so that every pass pastes its own suffix
 * onto the names it builds.  The dct_quantize_MMX/_MMX2/_SSE2/_SSSE3 symbols
 * used by MPV_common_init_mmx() presumably come from RENAME(dct_quantize) in
 * the template -- the template is not visible here, confirm.
 */
/* Remember whether SSSE3 was configured, then hide it from the first three passes. */
00676 #ifdef HAVE_SSSE3
00677 #define HAVE_SSSE3_BAK
00678 #endif
00679 #undef HAVE_SSSE3
00680
/* Pass 1: no MMX2, no SSE2, no SSSE3; suffixes _MMX / _mmx. */
00681 #undef HAVE_SSE2
00682 #undef HAVE_MMX2
00683 #define RENAME(a) a ## _MMX
00684 #define RENAMEl(a) a ## _mmx
00685 #include "mpegvideo_mmx_template.c"
00686
/* Pass 2: MMX2 enabled; suffixes _MMX2 / _mmx2. */
00687 #define HAVE_MMX2
00688 #undef RENAME
00689 #undef RENAMEl
00690 #define RENAME(a) a ## _MMX2
00691 #define RENAMEl(a) a ## _mmx2
00692 #include "mpegvideo_mmx_template.c"
00693
/* Pass 3: MMX2 and SSE2 enabled; suffixes _SSE2 / _sse2.
 * From here on HAVE_MMX2 and HAVE_SSE2 are defined regardless of how they
 * were configured before this section. */
00694 #define HAVE_SSE2
00695 #undef RENAME
00696 #undef RENAMEl
00697 #define RENAME(a) a ## _SSE2
00698 #define RENAMEl(a) a ## _sse2
00699 #include "mpegvideo_mmx_template.c"
00700
/* Pass 4, only when SSSE3 was configured: restore HAVE_SSSE3; suffix _SSSE3
 * for RENAME.  RENAMEl keeps the _sse2 suffix (NOTE(review): presumably
 * deliberate, reusing the SSE2 variant of whatever RENAMEl names -- confirm). */
00701 #ifdef HAVE_SSSE3_BAK
00702 #define HAVE_SSSE3
00703 #undef RENAME
00704 #undef RENAMEl
00705 #define RENAME(a) a ## _SSSE3
00706 #define RENAMEl(a) a ## _sse2
00707 #include "mpegvideo_mmx_template.c"
00708 #endif
00709
00710 void MPV_common_init_mmx(MpegEncContext *s)
00711 {
00712 if (mm_flags & MM_MMX) {
00713 const int dct_algo = s->avctx->dct_algo;
00714
00715 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
00716 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
00717 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
00718 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
00719 if(!(s->flags & CODEC_FLAG_BITEXACT))
00720 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
00721 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
00722
00723 draw_edges = draw_edges_mmx;
00724
00725 if (mm_flags & MM_SSE2) {
00726 s->denoise_dct= denoise_dct_sse2;
00727 } else {
00728 s->denoise_dct= denoise_dct_mmx;
00729 }
00730
00731 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
00732 #ifdef HAVE_SSSE3
00733 if(mm_flags & MM_SSSE3){
00734 s->dct_quantize= dct_quantize_SSSE3;
00735 } else
00736 #endif
00737 if(mm_flags & MM_SSE2){
00738 s->dct_quantize= dct_quantize_SSE2;
00739 } else if(mm_flags & MM_MMXEXT){
00740 s->dct_quantize= dct_quantize_MMX2;
00741 } else {
00742 s->dct_quantize= dct_quantize_MMX;
00743 }
00744 }
00745 }
00746 }