/* Predict.c, motion compensation routines                                    */

/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */

/*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * The MPEG Software Simulation Group does not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */

#include "mmx.h"

/*
 * 64-bit constants used as memory operands by the MMX kernels below.
 */

/* All bits zero: unpack partner that widens bytes to 16-bit words. */
static mmx_t PACKED_0           = (mmx_t)(long long)0x0000000000000000LL;
/* Four 16-bit words of 2: rounding term added before the ">> 2" of the
   word-wide four-sample average. */
static mmx_t PACKED_2           = (mmx_t)(long long)0x0002000200020002LL;
/* 0x01 in every byte: isolates bit 0 of each byte (the rounding bit of the
   two-sample byte average). */
static mmx_t mpeg_play_mask1 	= (mmx_t)(long long)0x0101010101010101LL;
/* 0x02 in every byte: initial value of the low-bit accumulator in the
   byte-wide four-sample average (the "+ 2" rounding term). */
static mmx_t mpeg_play_val2 	= (mmx_t)(long long)0x0202020202020202LL;
/* 0x03 in every byte: isolates the two low bits of each byte. */
static mmx_t mpeg_play_mask3 	= (mmx_t)(long long)0x0303030303030303LL;
/* 0xFE in every byte: clears bit 0 of each byte so that a 64-bit shift right
   by one cannot leak a bit into the neighbouring byte. */
static mmx_t mpeg_play_mask2 	= (mmx_t)(long long)0xFEFEFEFEFEFEFEFELL;
/* 0xFC in every byte: clears the two low bits of each byte so that a 64-bit
   shift right by two cannot leak bits into the neighbouring byte. */
static mmx_t mpeg_play_maskn3 	= (mmx_t)(long long)0xFCFCFCFCFCFCFCFCLL;

/*
 * Row helpers for form_component_prediction_mmx().  They are macros because
 * the mmx.h wrappers take MMX register names as plain tokens.  All of them
 * are #undef'd directly after the function.
 */

/*
 * mm1 = rounded-up byte average, (p + q + 1) >> 1, of the 8 bytes at a and
 * the 8 bytes at b.  Computed without widening as
 *   ((p & 0xFE) >> 1) + ((q & 0xFE) >> 1) + ((p | q) & 1).
 * Requires mm5 = mpeg_play_mask1 and mm4 = mpeg_play_mask2.
 * Clobbers mm2 and mm3.
 */
#define FCP_AVG2(a, b)                \
  do {                                \
    movq_m2r(*(a), mm1);              \
    movq_r2r(mm1, mm3);               \
    movq_m2r(*(b), mm2);              \
    por_r2r(mm2, mm3);                \
    pand_r2r(mm5, mm3);               \
    pand_r2r(mm4, mm1);               \
    pand_r2r(mm4, mm2);               \
    psrlq_i2r(1, mm1);                \
    psrlq_i2r(1, mm2);                \
    paddusb_r2r(mm2, mm1);            \
    paddusb_r2r(mm3, mm1);            \
  } while (0)

/*
 * mm0 = byte-wise (p00 + p01 + p10 + p11 + 2) >> 2 of the 8 bytes at each of
 * the four addresses.  The bytes are widened to 16-bit words for the sum and
 * packed back afterwards.  Clobbers mm1..mm7.
 */
#define FCP_AVG4(p00, p01, p10, p11)  \
  do {                                \
    movq_m2r(*(p00), mm0);            \
    movq_r2r(mm0, mm4);               \
    movq_m2r(*(p01), mm1);            \
    movq_r2r(mm1, mm5);               \
    movq_m2r(*(p10), mm2);            \
    movq_r2r(mm2, mm6);               \
    movq_m2r(*(p11), mm3);            \
    movq_r2r(mm3, mm7);               \
    punpcklbw_m2r(PACKED_0, mm0);     \
    punpcklbw_m2r(PACKED_0, mm1);     \
    punpcklbw_m2r(PACKED_0, mm2);     \
    punpcklbw_m2r(PACKED_0, mm3);     \
    punpckhbw_m2r(PACKED_0, mm4);     \
    punpckhbw_m2r(PACKED_0, mm5);     \
    punpckhbw_m2r(PACKED_0, mm6);     \
    punpckhbw_m2r(PACKED_0, mm7);     \
    paddw_r2r(mm1, mm0);              \
    paddw_r2r(mm3, mm2);              \
    paddw_r2r(mm5, mm4);              \
    paddw_r2r(mm7, mm6);              \
    paddw_r2r(mm2, mm0);              \
    paddw_r2r(mm6, mm4);              \
    paddw_m2r(PACKED_2, mm0);         \
    paddw_m2r(PACKED_2, mm4);         \
    psrlw_i2r(2, mm0);                \
    psrlw_i2r(2, mm4);                \
    packuswb_r2r(mm4, mm0);           \
  } while (0)

/*
 * Adds the eight 16-bit values at res to the eight prediction bytes held in
 * register lo and stores the result at out.  The bytes are widened against
 * the all-zero register zero, added with signed saturation and packed back to
 * bytes with unsigned saturation.  Register hi is used as scratch.
 */
#define FCP_ADD_RESIDUAL_STORE(lo, hi, zero, res, out) \
  do {                                                 \
    movq_r2r(lo, hi);                                  \
    punpcklbw_r2r(zero, lo);                           \
    paddsw_m2r(*(res), lo);                            \
    punpckhbw_r2r(zero, hi);                           \
    paddsw_m2r(*((res)+4), hi);                        \
    packuswb_r2r(hi, lo);                              \
    movq_r2m(lo, *(out));                              \
  } while (0)

/*
 * Forms one motion-compensated prediction block with MMX and writes it to
 * the destination plane.
 *
 * src, dst   reference and destination planes
 * blockvals  16-bit values added to the prediction when zflag == 0; eight
 *            values are consumed per row
 * lx         byte distance used to address row y within src/dst, and the
 *            distance to the second row of a vertical half-pel pair
 * lx2        byte distance by which source and destination advance per row
 * w, h       block width and height; h rows are processed, and nothing is
 *            written when h <= 0
 * x, y       block position inside the planes
 * dx, dy     motion vector in half-pel units: (dx>>1, dy>>1) is the whole
 *            sample displacement, bit 0 of each selects half-pel
 *            interpolation in that direction
 * zflag      0:        rows are 8 samples wide (w is not consulted); the
 *                      blockvals row is added to the prediction and the sum
 *                      is saturated to 0..255 before it is stored
 *            non-zero: the prediction itself is stored; 8 bytes per row
 *                      when w == 8, otherwise 16 bytes per row
 *
 * Half-pel samples are the rounded averages (a+b+1)>>1 or (a+b+c+d+2)>>2.
 * Every tap is read as a full 8-byte group, so horizontal half-pel reads one
 * byte beyond the block width per row and vertical half-pel reads the row at
 * s+lx as well.
 *
 * No emms is issued here.
 * NOTE(review): presumably the caller executes emms before any x87
 * floating-point code runs -- confirm.
 */
void form_component_prediction_mmx(unsigned char *src, unsigned char *dst,
                                   short int *blockvals, int lx, int lx2,
                                   int w, int h, int x, int y,
                                   int dx, int dy, int zflag)
{
  /* whole-sample part of the vector */
  int xint = dx >> 1;
  int yint = dy >> 1;
  /* half-sample flags */
  int xh = dx & 1;
  int yh = dy & 1;
  int j;
  /* first source and destination sample of the block */
  unsigned char *s = src + lx*(y+yint) + x + xint;
  unsigned char *d = dst + lx*y + x;

  if (!xh && !yh) /* whole-sample vector: plain copy */
  {
    if (!zflag)
    {
      pxor_r2r(mm3, mm3);
      for (j = h; j > 0; j--)
      {
        movq_m2r(*(s), mm1);
        FCP_ADD_RESIDUAL_STORE(mm1, mm2, mm3, blockvals, d);
        blockvals += 8;
        s += lx2;
        d += lx2;
      }
    }
    else if (w == 8)
    {
      for (j = h; j > 0; j--)
      {
        movq_m2r(*(s), mm0);
        movq_r2m(mm0, *(d));
        s += lx2;
        d += lx2;
      }
    }
    else
    {
      for (j = h; j > 0; j--)
      {
        movq_m2r(*(s), mm0);
        movq_r2m(mm0, *(d));
        movq_m2r(*(s+8), mm0);
        movq_r2m(mm0, *(d+8));
        s += lx2;
        d += lx2;
      }
    }
  }
  else if (xh && yh) /* half-pel in both directions: four-sample average */
  {
    if (!zflag)
    {
      for (j = h; j > 0; j--)
      {
        FCP_AVG4(s, s+1, s+lx, s+lx+1);
        /* FCP_AVG4 clobbers mm3, so the zero register is rebuilt per row */
        pxor_r2r(mm3, mm3);
        FCP_ADD_RESIDUAL_STORE(mm0, mm2, mm3, blockvals, d);
        blockvals += 8;
        s += lx2;
        d += lx2;
      }
    }
    else if (w == 8)
    {
      for (j = h; j > 0; j--)
      {
        FCP_AVG4(s, s+1, s+lx, s+lx+1);
        movq_r2m(mm0, *(d));
        s += lx2;
        d += lx2;
      }
    }
    else
    {
      for (j = h; j > 0; j--)
      {
        FCP_AVG4(s, s+1, s+lx, s+lx+1);
        movq_r2m(mm0, *(d));
        FCP_AVG4(s+8, s+9, s+lx+8, s+lx+9);
        movq_r2m(mm0, *(d+8));
        s += lx2;
        d += lx2;
      }
    }
  }
  else /* half-pel in exactly one direction: two-sample average */
  {
    /* distance to the second tap: next sample (horizontal half-pel) or the
       row lx bytes further on (vertical half-pel) */
    int off = xh ? 1 : lx;

    movq_m2r(mpeg_play_mask1, mm5);
    movq_m2r(mpeg_play_mask2, mm4);
    if (!zflag)
    {
      pxor_r2r(mm7, mm7);
      for (j = h; j > 0; j--)
      {
        FCP_AVG2(s, s+off);
        FCP_ADD_RESIDUAL_STORE(mm1, mm2, mm7, blockvals, d);
        blockvals += 8;
        s += lx2;
        d += lx2;
      }
    }
    else if (w == 8)
    {
      for (j = h; j > 0; j--)
      {
        FCP_AVG2(s, s+off);
        movq_r2m(mm1, *(d));
        s += lx2;
        d += lx2;
      }
    }
    else
    {
      for (j = h; j > 0; j--)
      {
        FCP_AVG2(s, s+off);
        movq_r2m(mm1, *(d));
        FCP_AVG2(s+8, s+8+off);
        movq_r2m(mm1, *(d+8));
        s += lx2;
        d += lx2;
      }
    }
  }
}

#undef FCP_AVG2
#undef FCP_AVG4
#undef FCP_ADD_RESIDUAL_STORE

/*
 * Row helpers for form_component_prediction_bidir_mmx().  They are macros
 * because the mmx.h wrappers take MMX register names as plain tokens.  All of
 * them are #undef'd directly after the function.
 */

/*
 * Combines the byte vectors in mm1 and mm2 into mm1 = (p + q + 1) >> 1,
 * computed without widening as
 *   ((p & 0xFE) >> 1) + ((q & 0xFE) >> 1) + ((p | q) & 1).
 * Requires mm5 = mpeg_play_mask1 and register himask = mpeg_play_mask2.
 * Clobbers mm2 and mm3.
 */
#define BIDIR_MERGE(himask)           \
  do {                                \
    movq_r2r(mm1, mm3);               \
    por_r2r(mm2, mm3);                \
    pand_r2r(mm5, mm3);               \
    pand_r2r(himask, mm1);            \
    pand_r2r(himask, mm2);            \
    psrlq_i2r(1, mm1);                \
    psrlq_i2r(1, mm2);                \
    paddusb_r2r(mm2, mm1);            \
    paddusb_r2r(mm3, mm1);            \
  } while (0)

/*
 * mm1 = (p + q + 1) >> 1 of the 8 bytes at a and the 8 bytes at b.
 * Requires mm5 = mpeg_play_mask1 and mm4 = mpeg_play_mask2.
 * Clobbers mm2 and mm3.
 */
#define BIDIR_AVG2(a, b)              \
  do {                                \
    movq_m2r(*(a), mm1);              \
    movq_r2r(mm1, mm3);               \
    movq_m2r(*(b), mm2);              \
    por_r2r(mm2, mm3);                \
    pand_r2r(mm5, mm3);               \
    pand_r2r(mm4, mm1);               \
    pand_r2r(mm4, mm2);               \
    psrlq_i2r(1, mm1);                \
    psrlq_i2r(1, mm2);                \
    paddusb_r2r(mm2, mm1);            \
    paddusb_r2r(mm3, mm1);            \
  } while (0)

/*
 * One tap of the four-sample average: loads the 8 bytes at p into reg, adds
 * their two low bits to the accumulator mm5 and leaves (byte >> 2) in reg.
 * Requires mm7 = mpeg_play_mask3 and mm4 = mpeg_play_maskn3.  Clobbers mm6.
 */
#define BIDIR_TAP(p, reg)             \
  do {                                \
    movq_m2r(*(p), reg);              \
    movq_r2r(reg, mm6);               \
    pand_r2r(mm7, mm6);               \
    paddusb_r2r(mm6, mm5);            \
    pand_r2r(mm4, reg);               \
    psrlq_i2r(2, reg);                \
  } while (0)

/*
 * acc = byte-wise (a + b + c + d + 2) >> 2 of the 8 bytes at p0..p3, computed
 * without widening: the four (byte >> 2) terms are summed in acc while mm5
 * collects 2 plus the four two-bit remainders (at most 14 per byte), whose
 * quotient by four is added at the end.
 * Requires mm3 = mpeg_play_val2 on entry, mm7 = mpeg_play_mask3 and
 * mm4 = mpeg_play_maskn3.  Clobbers tmp, mm5 and mm6; tmp may be mm3 because
 * mm3 is copied to mm5 before the first tap is loaded.
 */
#define BIDIR_AVG4(acc, tmp, p0, p1, p2, p3) \
  do {                                       \
    movq_r2r(mm3, mm5);                      \
    BIDIR_TAP(p0, acc);                      \
    BIDIR_TAP(p1, tmp);                      \
    paddusb_r2r(tmp, acc);                   \
    BIDIR_TAP(p2, tmp);                      \
    paddusb_r2r(tmp, acc);                   \
    BIDIR_TAP(p3, tmp);                      \
    paddusb_r2r(tmp, acc);                   \
    pand_r2r(mm4, mm5);                      \
    psrlq_i2r(2, mm5);                       \
    paddusb_r2r(mm5, acc);                   \
  } while (0)

/*
 * Adds the eight 16-bit values at res to the eight prediction bytes held in
 * register lo and stores the result at out.  The bytes are widened against
 * the all-zero register zero, added with signed saturation and packed back to
 * bytes with unsigned saturation.  Register hi is used as scratch.
 */
#define BIDIR_ADD_RESIDUAL_STORE(lo, hi, zero, res, out) \
  do {                                                   \
    movq_r2r(lo, hi);                                    \
    punpcklbw_r2r(zero, lo);                             \
    paddsw_m2r(*(res), lo);                              \
    punpckhbw_r2r(zero, hi);                             \
    paddsw_m2r(*((res)+4), hi);                          \
    packuswb_r2r(hi, lo);                                \
    movq_r2m(lo, *(out));                                \
  } while (0)

/*
 * Forms the rounded average of two motion-compensated predictions with MMX
 * and writes it to the destination plane.
 *
 * srcb, srcf  the two reference planes (going by the names, the backward and
 *             the forward reference)
 * dst         destination plane
 * blockvals   16-bit values added to the prediction when zflag == 0; eight
 *             values are consumed per row
 * lx          byte distance used to address row y within the planes, and the
 *             distance to the second row of a vertical half-pel pair
 * lx2         byte distance by which sources and destination advance per row
 * w, h        block width and height; h rows are processed, and nothing is
 *             written when h <= 0
 * x, y        block position inside the planes
 * dbx, dby    vector into srcb in half-pel units
 * dfx, dfy    vector into srcf in half-pel units
 * zflag       0:        rows are 8 samples wide (w is not consulted); the
 *                       blockvals row is added to the averaged prediction and
 *                       the sum is saturated to 0..255 before it is stored
 *             non-zero: the averaged prediction itself is stored; 8 bytes per
 *                       row when w == 8, otherwise 16 bytes per row
 *
 * When any half-pel flag is set, each reference is first reduced with the
 * four-tap form (a + b + c + d + 2) >> 2 over the samples at offsets 0, xh,
 * lx*yh and lx*yh + xh.  Taps coincide when a flag is clear, so this yields
 * the sample itself, the two-sample average (a + b + 1) >> 1, or the full
 * four-sample average as appropriate.  The two results are then combined as
 * (p + q + 1) >> 1.
 *
 * No emms is issued here.
 * NOTE(review): presumably the caller executes emms before any x87
 * floating-point code runs -- confirm.
 */
void form_component_prediction_bidir_mmx(unsigned char *srcb,
                                         unsigned char *srcf,
                                         unsigned char *dst,
                                         short int *blockvals,
                                         int lx, int lx2, int w, int h,
                                         int x, int y,
                                         int dbx, int dby, int dfx, int dfy,
                                         int zflag)
{
  /* whole-sample parts of the two vectors */
  int xbint = dbx >> 1;
  int ybint = dby >> 1;
  int xfint = dfx >> 1;
  int yfint = dfy >> 1;
  /* half-sample flags */
  int xbh = dbx & 1;
  int ybh = dby & 1;
  int xfh = dfx & 1;
  int yfh = dfy & 1;
  int j;
  /* first sample of the block in each reference and in the destination */
  unsigned char *sb = srcb + lx*(y+ybint) + x + xbint;
  unsigned char *sf = srcf + lx*(y+yfint) + x + xfint;
  unsigned char *d = dst + lx*y + x;

  if (!xbh && !ybh && !xfh && !yfh) /* both vectors are whole-sample */
  {
    movq_m2r(mpeg_play_mask1, mm5);
    movq_m2r(mpeg_play_mask2, mm4);
    if (!zflag)
    {
      pxor_r2r(mm7, mm7);
      for (j = h; j > 0; j--)
      {
        BIDIR_AVG2(sb, sf);
        BIDIR_ADD_RESIDUAL_STORE(mm1, mm2, mm7, blockvals, d);
        blockvals += 8;
        sb += lx2;
        sf += lx2;
        d += lx2;
      }
    }
    else
    {
      for (j = h; j > 0; j--)
      {
        BIDIR_AVG2(sb, sf);
        movq_r2m(mm1, *(d));
        if (w != 8)
        {
          BIDIR_AVG2(sb+8, sf+8);
          movq_r2m(mm1, *(d+8));
        }
        sb += lx2;
        sf += lx2;
        d += lx2;
      }
    }
  }
  else /* at least one half-pel component */
  {
    /* offsets of the three extra taps relative to sb and sf */
    int ba = xbh;
    int bb = lx*ybh;
    int bc = bb + xbh;
    int fa = xfh;
    int fb = lx*yfh;
    int fc = fb + xfh;

    if (!zflag)
    {
      movq_m2r(mpeg_play_mask3, mm7);
      for (j = h; j > 0; j--)
      {
        /* mm4 is reused for mpeg_play_mask2 further down, so it is reloaded
           on every row; mm3 is consumed by the second BIDIR_AVG4 */
        movq_m2r(mpeg_play_maskn3, mm4);
        movq_m2r(mpeg_play_val2, mm3);

        BIDIR_AVG4(mm1, mm2, sb, sb+ba, sb+bb, sb+bc);
        BIDIR_AVG4(mm2, mm3, sf, sf+fa, sf+fb, sf+fc);

        movq_m2r(mpeg_play_mask2, mm4);
        movq_m2r(mpeg_play_mask1, mm5);
        pxor_r2r(mm6, mm6);
        BIDIR_MERGE(mm4);
        BIDIR_ADD_RESIDUAL_STORE(mm1, mm2, mm6, blockvals, d);

        blockvals += 8;
        sb += lx2;
        sf += lx2;
        d += lx2;
      }
    }
    else
    {
      movq_m2r(mpeg_play_maskn3, mm4);
      movq_m2r(mpeg_play_mask3, mm7);
      for (j = h; j > 0; j--)
      {
        movq_m2r(mpeg_play_val2, mm3);

        BIDIR_AVG4(mm1, mm2, sb, sb+ba, sb+bb, sb+bc);
        BIDIR_AVG4(mm2, mm3, sf, sf+fa, sf+fb, sf+fc);

        /* mm6 rather than mm4 holds the 0xFE mask here so that
           mpeg_play_maskn3 stays resident in mm4 across rows */
        movq_m2r(mpeg_play_mask2, mm6);
        movq_m2r(mpeg_play_mask1, mm5);
        BIDIR_MERGE(mm6);
        movq_r2m(mm1, *(d));

        if (w != 8)
        {
          movq_m2r(mpeg_play_val2, mm3);

          BIDIR_AVG4(mm1, mm2, sb+8, sb+ba+8, sb+bb+8, sb+bc+8);
          BIDIR_AVG4(mm2, mm3, sf+8, sf+fa+8, sf+fb+8, sf+fc+8);

          movq_m2r(mpeg_play_mask2, mm6);
          movq_m2r(mpeg_play_mask1, mm5);
          BIDIR_MERGE(mm6);
          movq_r2m(mm1, *(d+8));
        }

        sb += lx2;
        sf += lx2;
        d += lx2;
      }
    }
  }
}

#undef BIDIR_MERGE
#undef BIDIR_AVG2
#undef BIDIR_TAP
#undef BIDIR_AVG4
#undef BIDIR_ADD_RESIDUAL_STORE

