/* Predict.c, motion compensation routines                                    */

/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */

/*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * The MPEG Software Simulation Group does not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */

#include "mmx.h"

/*
 * 64-bit constants used as memory operands by the MMX code below.
 * NOTE(review): mmx_t is declared in mmx.h (not visible here); the
 * (mmx_t)(long long) cast presumably relies on it being a 64-bit union and
 * on the GNU C cast-to-union extension -- confirm against mmx.h.
 */

/* 0x01 in every byte: ANDed with (a | b) to obtain the per-byte rounding bit
   of a two-way average. */
static mmx_t MMX_MASK_1 = (mmx_t)(long long)0x0101010101010101LL;
/* 0xFE in every byte: clears bit 0 of each byte before the 64-bit shift right
   by one, so no bit crosses into the neighbouring byte. */
static mmx_t MMX_MASK_2 = (mmx_t)(long long)0xFEFEFEFEFEFEFEFELL;
/* All zero: second operand of the punpcklbw/punpckhbw byte unpacks. */
static mmx_t PACKED_0           = (mmx_t)(long long)0x0000000000000000LL;
/* 1 in each 16-bit word: added before the final shift right by one in the
   half-pel-both-ways averaging path. */
static mmx_t PACKED_1           = (mmx_t)(long long)0x0001000100010001LL;
/* 2 in each 16-bit word: added before the shift right by two of the
   four-sample sum. */
static mmx_t PACKED_2           = (mmx_t)(long long)0x0002000200020002LL;

/*
 * Building blocks for mpeg2play_form_component_prediction_mmx().
 *
 * They are macros rather than functions so that every instruction is emitted
 * inside the one function, in the same order as a hand-unrolled version, with
 * the MMX registers acting as implicit operands.  All of them refer to the
 * function's locals (s, d, lx, lx2, step, wide, h, j) by name and are
 * #undef'd again after the function.
 *
 * The arithmetic described below assumes the mmx.h macros emit the
 * like-named MMX instructions.
 */

/*
 * mm1 = per-byte (mm1 + mem + 1) >> 1, formed as
 * ((mm1 & 0xFE) >> 1) + ((mem & 0xFE) >> 1) + ((mm1 | mem) & 0x01).
 * Requires mm5 = MMX_MASK_1 and mm4 = MMX_MASK_2; clobbers mm2 and mm3.
 */
#define FCP_AVG_MM1_WITH(mem)      \
  do {                             \
    movq_r2r(mm1, mm3);            \
    movq_m2r(mem, mm2);            \
    por_r2r(mm2, mm3);             \
    pand_r2r(mm5, mm3);            \
    pand_r2r(mm4, mm1);            \
    pand_r2r(mm4, mm2);            \
    psrlq_i2r(1, mm1);             \
    psrlq_i2r(1, mm2);             \
    paddusb_r2r(mm2, mm1);         \
    paddusb_r2r(mm3, mm1);         \
  } while (0)

/*
 * mm0 (pels 0..3) and mm4 (pels 4..7) = 16-bit sums of the four 8-byte groups
 * at p, p+1, p+lx and p+lx+1.  Uses all of mm0..mm7.
 */
#define FCP_SUM4(p)                    \
  do {                                 \
    movq_m2r(*(p), mm0);               \
    movq_r2r(mm0, mm4);                \
    movq_m2r(*((p) + 1), mm1);         \
    movq_r2r(mm1, mm5);                \
    movq_m2r(*((p) + lx), mm2);        \
    movq_r2r(mm2, mm6);                \
    movq_m2r(*((p) + lx + 1), mm3);    \
    movq_r2r(mm3, mm7);                \
    punpcklbw_m2r(PACKED_0, mm0);      \
    punpcklbw_m2r(PACKED_0, mm1);      \
    punpcklbw_m2r(PACKED_0, mm2);      \
    punpcklbw_m2r(PACKED_0, mm3);      \
    punpckhbw_m2r(PACKED_0, mm4);      \
    punpckhbw_m2r(PACKED_0, mm5);      \
    punpckhbw_m2r(PACKED_0, mm6);      \
    punpckhbw_m2r(PACKED_0, mm7);      \
    paddw_r2r(mm1, mm0);               \
    paddw_r2r(mm3, mm2);               \
    paddw_r2r(mm5, mm4);               \
    paddw_r2r(mm7, mm6);               \
    paddw_r2r(mm2, mm0);               \
    paddw_r2r(mm6, mm4);               \
  } while (0)

/* Integer-pel, no averaging: copy 8 pels at byte offset o. */
#define FCP_PUT_FULL(o)                \
  do {                                 \
    movq_m2r(*(s + (o)), mm0);         \
    movq_r2m(mm0, *(d + (o)));         \
  } while (0)

/* Integer-pel, averaging: d = (s + d + 1) >> 1 for 8 pels at offset o. */
#define FCP_AVG_FULL(o)                \
  do {                                 \
    movq_m2r(*(s + (o)), mm1);         \
    FCP_AVG_MM1_WITH(*(d + (o)));      \
    movq_r2m(mm1, *(d + (o)));         \
  } while (0)

/* One-direction half-pel, no averaging:
   d = (s[0] + s[step] + 1) >> 1 for 8 pels at offset o. */
#define FCP_PUT_HALF(o)                      \
  do {                                       \
    movq_m2r(*(s + (o)), mm1);               \
    FCP_AVG_MM1_WITH(*(s + step + (o)));     \
    movq_r2m(mm1, *(d + (o)));               \
  } while (0)

/* One-direction half-pel, averaging: the half-pel value above is averaged
   (again with upward rounding) with what is already in d. */
#define FCP_AVG_HALF(o)                      \
  do {                                       \
    movq_m2r(*(s + (o)), mm1);               \
    FCP_AVG_MM1_WITH(*(s + step + (o)));     \
    FCP_AVG_MM1_WITH(*(d + (o)));            \
    movq_r2m(mm1, *(d + (o)));               \
  } while (0)

/* Half-pel in both directions, no averaging:
   d = (four-sample sum + 2) >> 2 for 8 pels at offset o. */
#define FCP_PUT_QUAD(o)                \
  do {                                 \
    FCP_SUM4(s + (o));                 \
    paddw_m2r(PACKED_2, mm0);          \
    paddw_m2r(PACKED_2, mm4);          \
    psrlw_i2r(2, mm0);                 \
    psrlw_i2r(2, mm4);                 \
    packuswb_r2r(mm4, mm0);            \
    movq_r2m(mm0, *(d + (o)));         \
  } while (0)

/* Half-pel in both directions, averaging:
   d = (((four-sample sum + 2) >> 2) + d + 1) >> 1 for 8 pels at offset o. */
#define FCP_AVG_QUAD(o)                \
  do {                                 \
    FCP_SUM4(s + (o));                 \
    movq_m2r(*(d + (o)), mm1);         \
    movq_r2r(mm1, mm5);                \
    paddw_m2r(PACKED_2, mm0);          \
    paddw_m2r(PACKED_2, mm4);          \
    punpcklbw_m2r(PACKED_0, mm1);      \
    punpckhbw_m2r(PACKED_0, mm5);      \
    psrlw_i2r(2, mm0);                 \
    psrlw_i2r(2, mm4);                 \
    paddw_m2r(PACKED_1, mm0);          \
    paddw_m2r(PACKED_1, mm4);          \
    paddw_r2r(mm1, mm0);               \
    paddw_r2r(mm5, mm4);               \
    psrlw_i2r(1, mm0);                 \
    psrlw_i2r(1, mm4);                 \
    packuswb_r2r(mm4, mm0);            \
    movq_r2m(mm0, *(d + (o)));         \
  } while (0)

/* Apply KERNEL to h rows: always to bytes 0..7, and to bytes 8..15 as well
   when the block is 16 pels wide; s and d advance by lx2 after each row. */
#define FCP_ROWS(KERNEL)               \
  for (j = h; j != 0; j--)             \
  {                                    \
    KERNEL(0);                         \
    if (wide)                          \
    {                                  \
      KERNEL(8);                       \
    }                                  \
    s += lx2;                          \
    d += lx2;                          \
  }

/*
 * Form the motion-compensated prediction for one w x h block of a single
 * colour component using MMX.
 *
 *   src          reference picture the prediction is read from
 *   dst          picture the prediction is written to
 *   lx           raster line increment; also the distance to the "next line"
 *                sample used for vertical half-pel interpolation
 *   lx2          amount both pointers advance after each processed row
 *   w, h         block width and height; only w == 8 and w == 16 are
 *                handled, any other width leaves dst untouched
 *   x, y         block position inside the picture
 *   dx, dy       motion vector in half-pel units: bit 0 is the half-pel flag,
 *                the remaining bits (dx>>1, dy>>1) the integer displacement
 *   average_flag flag that signals bi-directional or Dual-Prime averaging
 *                (7.6.7.1 and 7.6.7.4).  If non-zero, a previously formed
 *                prediction is already stored in dst and the new one is
 *                averaged into it; otherwise dst is overwritten.
 *
 * The source is read 8 bytes at a time starting at up to one byte to the
 * right of, and one line (lx) below, the block, so the reference buffer must
 * be readable there.
 */
void mpeg2play_form_component_prediction_mmx(unsigned char *src,
                                             unsigned char *dst,
                                             int lx, int lx2,
                                             int w, int h,
                                             int x, int y,
                                             int dx, int dy,
                                             int average_flag)
{
  /* integer sample vector: analogous to int_vec[] */
  const int xint = dx >> 1;
  const int yint = dy >> 1;
  /* half sample flags: analogous to half_flag[] */
  const int xh = dx & 1;
  const int yh = dy & 1;
  /* non-zero when a second group of 8 pels has to be processed per row */
  const int wide = (w == 16);
  /* linear addresses of pel_ref[][] and pel_pred[][] derived from the
     cartesian/raster coordinates provided */
  unsigned char *s = src + lx*(y+yint) + x + xint;
  unsigned char *d = dst + lx*y + x;
  int step;      /* offset of the second sample for one-direction half-pel */
  int j;

  if (w == 8 || wide)
  {
    if (!xh && !yh) /* no horizontal nor vertical half-pel */
    {
      if (average_flag)
      {
        movq_m2r(MMX_MASK_1, mm5);
        movq_m2r(MMX_MASK_2, mm4);
        FCP_ROWS(FCP_AVG_FULL)
      }
      else
      {
        FCP_ROWS(FCP_PUT_FULL)
      }
    }
    else if (!xh || !yh) /* half-pel in exactly one direction */
    {
      /* vertical half-pel pairs each sample with the one a line below,
         horizontal half-pel with its right-hand neighbour */
      step = yh ? lx : 1;

      movq_m2r(MMX_MASK_1, mm5);
      movq_m2r(MMX_MASK_2, mm4);
      if (average_flag)
      {
        FCP_ROWS(FCP_AVG_HALF)
      }
      else
      {
        FCP_ROWS(FCP_PUT_HALF)
      }
    }
    else /* horizontal and vertical half-pel */
    {
      if (average_flag)
      {
        FCP_ROWS(FCP_AVG_QUAD)
      }
      else
      {
        FCP_ROWS(FCP_PUT_QUAD)
      }
    }
  }

  /* always executed, whatever path was taken above */
  emms();
}

#undef FCP_ROWS
#undef FCP_AVG_QUAD
#undef FCP_PUT_QUAD
#undef FCP_AVG_HALF
#undef FCP_PUT_HALF
#undef FCP_AVG_FULL
#undef FCP_PUT_FULL
#undef FCP_SUM4
#undef FCP_AVG_MM1_WITH
