/*
  stores heavily used copy functions (makes mmx support easier)
  Copyright (C) 2000  Martin Vogt

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU Library General Public License as published by
  the Free Software Foundation.

  For more information look at the file COPYRIGHT in this package

 */


#include "copyFunctions.h"


#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/*
 * We use a lookup table to make sure values stay in the 0..255 range.
 * Since this is cropping (i.e., x = (x < 0)?0:(x>255)?255:x; ), we call this
 * table the "crop table".
 * MAX_NEG_CROP is the maximum neg/pos value we can handle.
 */

// Compiler cannot allocate too big arrays.


// mmx goodies
// Constants that the inline MMX assembly in this file loads by symbol name
// ("movq NAME, %%mmN"). Each one is written as a pair of 32-bit patterns.
// NOTE(review): this presumably relies on `long` being 32 bits wide so that
// the two elements form a single 8-byte operand -- confirm for the target ABI.

// 0x01 in every byte; loaded into mm6 by the MMX path of copy8_div2_nocrop.
static long ADD_1[]    = {0x01010101, 0x01010101};
// 0x0001 in every 16-bit word; loaded into mm6 by the MMX path of
// copy8_div2_src3linear_crop.
static long ADDW_1[]   = {0x00010001, 0x00010001};
// 0x7f in every byte; loaded into mm5 by the copy*_div2_* MMX paths and
// and-ed onto the pixels after a word-wise right shift by one.
static long MASK_AND[] = {0x7f7f7f7f, 0x7f7f7f7f};

// Prints the addresses of the three MMX constant tables, one per line.
// NOTE(review): presumably exists only so the tables count as "used" from
// C++ (they are otherwise referenced just inside asm strings) -- confirm.
void dummyCopyFunctions() {
  cout << "ADD_1:" << ADD_1 << endl
       << "ADDW_1:" << ADDW_1 << endl
       << "MASK_AND:" << MASK_AND << endl;
}

/*
 * Builds the crop table and decides whether the MMX code paths are used.
 *
 * The table has NUM_CROP_ENTRIES entries; the entry for a value v is stored
 * at index (v + MAX_NEG_CROP) and holds v clamped to 0..255. `cm` is set to
 * the entry for v == 0, so that cm[v] can be indexed with negative v.
 */
CopyFunctions::CopyFunctions() {
  cropTbl=new unsigned char[NUM_CROP_ENTRIES];

  // Fill the table through a pointer to its centre (the slot for value 0).
  unsigned char* centre = cropTbl + MAX_NEG_CROP;
  for (int value = (-MAX_NEG_CROP);
       value < NUM_CROP_ENTRIES - MAX_NEG_CROP;
       value++) {
    centre[value] = (value <= 0) ? 0 : ((value >= 255) ? 255 : value);
  }
  cm=centre;

#ifdef INTEL
  // Use the MMX paths only when mm_support() reports them as available.
  lmmx=mm_support();
#else
  lmmx=false;
  cout << "no INTEL arch- disable MMX in copyFunctions"<<endl;
#endif
}


/*
 * Releases the crop table.
 * cropTbl is allocated with new[] in the constructor, so it has to be
 * released with delete[]; a scalar delete on it is undefined behaviour.
 */
CopyFunctions::~CopyFunctions() {
  delete[] cropTbl;
}

// Counterpart of endNOFloatSection(); currently does nothing.
// NOTE(review): presumably marks the start of a region in which the MMX copy
// routines run and x87 floating point must not be used -- confirm with callers.
void CopyFunctions::startNOFloatSection() {
  // nothing
}


// Ends a section started with startNOFloatSection(): when the MMX paths are
// active (INTEL build and lmmx set) it calls emms(); otherwise it is a no-op.
void CopyFunctions::endNOFloatSection() {
#ifdef INTEL
  if (!lmmx) {
    return;
  }
  emms();
#endif
}


/*
 * Copies an 8x8 block of bytes from source1 to dest.
 *
 *   source1  first byte of the source block
 *   dest     first byte of the destination block
 *   inc      distance in bytes between two rows, used for both source1
 *            and dest
 *
 * Without MMX each row is copied with memcpy. The MMX path (INTEL builds
 * only) moves two rows per loop pass; it uses 32-bit lea on the pointer
 * registers.
 */
void CopyFunctions::copy8_byte(unsigned char* source1,
			       unsigned char* dest,int inc) {
  if (lmmx == false) {
    for (int row = 0; row < 8; row++) {
      memcpy(dest,source1,sizeof(char)*8);
      source1+=inc;
      dest+=inc;
    }
  } else {
#ifdef INTEL
    // Two rows per pass, so four passes cover the block.
    int rr=4;

    // The loop advances source1 and dest and counts rr down, so they must
    // be read-write operands; declaring them as plain inputs lets the
    // compiler assume the registers are unchanged. inc is not modified; it
    // is listed as read-write only to keep the operand numbering.
    // "memory" tells the compiler that the block is read and written behind
    // its back. volatile keeps the statement from being dropped because its
    // register outputs are unused.
    asm volatile (
	  "1:\n"
	  "movq   (%0)    ,%%mm0\n"   // row n   -> mm0
	  "leal   (%0,%2) ,%0\n"      // source1 += inc
	  "movq   (%0)    ,%%mm1\n"   // row n+1 -> mm1
	  "leal   (%0,%2) ,%0\n"      // source1 += inc

	  // Write
	  "movq   %%mm0   ,(%1)\n"
	  "leal   (%1,%2) ,%1\n"      // dest += inc
	  "movq   %%mm1   ,(%1)\n"
	  "leal   (%1,%2) ,%1\n"      // dest += inc

	  "decl       %3\n"
	  "jnz        1b\n"
	  : "+r"(source1),"+r"(dest),"+r"(inc),"+r"(rr)
	  :
	  : "memory", "cc"
	  );
#endif
  }
}

/*
 * Copies an 8x8 block of 16-bit values from source1 to dest.
 * inc is the distance between two rows, counted in 16-bit elements, and is
 * applied to both pointers.
 */
void CopyFunctions::copy8_word(unsigned short* source1,
			       unsigned short* dest,int inc) {
  // Optimisation is slower, leave it in C
  int rowsLeft = 8;
  while (rowsLeft-- > 0) {
    memcpy(dest, source1, 8 * sizeof(short));
    dest += inc;
    source1 += inc;
  }
}


 
/*
 * Writes an 8x8 block of 16-bit values to dest as bytes, clamping every
 * value to 0..255.
 *
 *   source1  64 consecutive shorts (rows follow each other directly)
 *   dest     first byte of the destination block
 *   inc      distance in bytes between two destination rows
 *
 * Without MMX the clamping is done through the crop table cm. The MMX path
 * (INTEL builds only) uses packuswb, which saturates signed words to
 * unsigned bytes; it uses a 32-bit add on the dest register.
 */
void CopyFunctions::copy8_src1linear_crop(short* source1,
					  unsigned char* dest,int inc) {

  if (lmmx == false) {
    for (int row = 0; row < 8; row++) {
      for (int col = 0; col < 8; col++) {
	dest[col] = cm[source1[col]];
      }
      dest += inc;
      source1 += 8;
    }
  } else {
#ifdef INTEL
    // dest is advanced inside the asm, so it is a read-write operand; the
    // early-clobber (&) keeps it out of the registers chosen for the inputs.
    // "memory" covers the loads from source1 and the stores to dest.
    asm volatile (
		  "movq             (%1),%%mm0\n"   // row 0
		  "packuswb       8(%1),%%mm0\n"
		  "movq            %%mm0,(%0)\n"
		  "addl                   %2,%0\n"  // dest += inc

		  "movq            16(%1),%%mm0\n"  // row 1
		  "packuswb       24(%1),%%mm0\n"
		  "movq            %%mm0,(%0)\n"
		  "addl                   %2,%0\n"

		  "movq            32(%1),%%mm0\n"  // row 2
		  "packuswb       40(%1),%%mm0\n"
		  "movq            %%mm0,(%0)\n"
		  "addl                   %2,%0\n"

		  "movq            48(%1),%%mm0\n"  // row 3
		  "packuswb       56(%1),%%mm0\n"
		  "movq            %%mm0,(%0)\n"
		  "addl                   %2,%0\n"

		  "movq            64(%1),%%mm0\n"  // row 4
		  "packuswb       72(%1),%%mm0\n"
		  "movq            %%mm0,(%0)\n"
		  "addl                   %2,%0\n"

		  "movq            80(%1),%%mm0\n"  // row 5
		  "packuswb       88(%1),%%mm0\n"
		  "movq            %%mm0,(%0)\n"
		  "addl                   %2,%0\n"

		  "movq            96(%1),%%mm0\n"  // row 6
		  "packuswb       104(%1),%%mm0\n"
		  "movq            %%mm0,(%0)\n"
		  "addl                   %2,%0\n"

		  "movq            112(%1),%%mm0\n" // row 7
		  "packuswb       120(%1),%%mm0\n"
		  "movq            %%mm0,(%0)\n"
		  : "+&r" (dest)
		  : "r" (source1),"r" (inc)
		  : "memory", "cc"
		  );
#endif
  }

}

/*
 * Averages two 8x8 byte blocks into dest.
 *
 *   source1, source2  first bytes of the two source blocks
 *   dest              first byte of the destination block
 *   inc               distance in bytes between two rows, used for all
 *                     three pointers
 *
 * Without MMX every pixel is (source1 + source2 + 1) >> 1.
 * The MMX path (INTEL builds only) computes
 * (source1 >> 1) + (source2 >> 1) + 1 per byte with saturating adds, which
 * is not bit-identical to the C path (e.g. 0 and 0 give 1 instead of 0).
 */
void CopyFunctions::copy8_div2_nocrop(unsigned char* source1,
				      unsigned char* source2,
				      unsigned char* dest,int inc) {
  if (lmmx == false) {
    for (int row = 0; row < 8; row++) {
      for (int col = 0; col < 8; col++) {
	dest[col] = (int) (source1[col] + source2[col]+1) >> 1;
      }
      dest += inc;
      source1 += inc;
      source2 += inc;
    }
  } else {
#ifdef INTEL
    int h=8;
    // h, source1, dest and source2 are changed by the loop, so they are
    // read-write operands. inc is not modified; it is listed as read-write
    // only to keep the operand numbering. "memory" covers the pixel loads
    // and stores.
    asm volatile (
		  "movq       MASK_AND,  %%mm5\n"
		  "movq       ADD_1,     %%mm6\n"
		  "1:\t"
		  "movq       (%1),   %%mm0\n"      /* 8 pixels of source1 */
		  "movq       (%4),   %%mm1\n"      /* 8 pixels of source2 */
		  "psrlw      $1,%%mm0\n"
		  "psrlw      $1,%%mm1\n"
		  "pand       %%mm5,%%mm0\n"        /* drop bits shifted across bytes */
		  "pand       %%mm5,%%mm1\n"
		  "paddusb    %%mm1,%%mm0\n"
		  "addl       %3,%1\n"              /* source1 += inc */
		  "paddusb    %%mm6,%%mm0\n"        /* + 1 per byte */
		  "addl       %3,%4\n"              /* source2 += inc */
		  "movq       %%mm0,(%2)\n"
		  "decl       %0\n"
		  "leal       (%2, %3), %2\n"       /* dest += inc, flags untouched */
		  "jnz        1b\n"
		  : "+c"(h), "+r"(source1), "+r"(dest), "+r"(inc), "+r"(source2)
		  :
		  : "memory", "cc"
		  );
#endif
  }

}

/*
 * Averages two 8x8 byte blocks into a linear destination.
 *
 *   source1, source2  first bytes of the two source blocks
 *   dest              64 consecutive bytes (rows follow each other directly)
 *   inc               distance in bytes between two source rows
 *
 * Without MMX every pixel is (source1 + source2) >> 1 (no rounding).
 * The MMX path (INTEL builds only) computes (source1 >> 1) + (source2 >> 1)
 * per byte, which is one lower than the C path when both inputs are odd.
 */
void CopyFunctions::copy8_div2_destlinear_nocrop(unsigned char* source1,
						 unsigned char* source2,
						 unsigned char* dest,int inc) {

  if (lmmx == false) {
    for (int row = 0; row < 8; row++) {
      for (int col = 0; col < 8; col++) {
	dest[col] = (int) (source1[col] + source2[col]) >> 1;
      }
      dest += 8;
      source1 += inc;
      source2 += inc;
    }
  } else {
#ifdef INTEL
    int h=8;
    // h, source1, dest and source2 are changed by the loop, so they are
    // read-write operands. inc is not modified; it is listed as read-write
    // only to keep the operand numbering. "memory" covers the pixel loads
    // and stores.
    asm volatile (
		  "movq       MASK_AND,  %%mm5\n"
		  "1:\t"
		  "movq       (%1),   %%mm0\n"      /* 8 pixels of source1 */
		  "movq       (%4),   %%mm1\n"      /* 8 pixels of source2 */
		  "psrlw      $1,%%mm0\n"
		  "psrlw      $1,%%mm1\n"
		  "pand       %%mm5,%%mm0\n"        /* drop bits shifted across bytes */
		  "pand       %%mm5,%%mm1\n"
		  "paddusb    %%mm1,%%mm0\n"
		  "addl       %3,%1\n"              /* source1 += inc */
		  "addl       %3,%4\n"              /* source2 += inc */
		  "movq       %%mm0,(%2)\n"
		  "decl       %0\n"
		  "leal       8(%2), %2\n"          /* dest += 8, flags untouched */
		  "jnz        1b\n"
		  : "+c"(h), "+r"(source1), "+r"(dest), "+r"(inc), "+r"(source2)
		  :
		  : "memory", "cc"
		  );
#endif
  }
}
  

/*
 * Averages two 16x16 byte blocks into a linear destination.
 *
 *   source1, source2  first bytes of the two source blocks
 *   dest              256 consecutive bytes (rows follow each other directly)
 *   inc               distance in bytes between two source rows
 *
 * Without MMX every pixel is (source1 + source2) >> 1 (no rounding).
 * The MMX path (INTEL builds only) handles each row as two 8-byte halves and
 * computes (source1 >> 1) + (source2 >> 1) per byte, which is one lower than
 * the C path when both inputs are odd.
 */
void CopyFunctions::copy16_div2_destlinear_nocrop(unsigned char* source1,
						  unsigned char* source2,
						  unsigned char* dest,int inc){

  if (lmmx == false) {
    for (int row = 0; row < 16; row++) {
      for (int col = 0; col < 16; col++) {
	dest[col] = (int) (source1[col] + source2[col]) >> 1;
      }
      dest += 16;
      source1 += inc;
      source2 += inc;
    }
  } else {
#ifdef INTEL
    int h=16;
    // The sources are already advanced by 8 after the first half of a row,
    // so only the remaining inc-8 bytes are added at the end of the row.
    inc=inc-8;
    // h, source1, dest and source2 are changed by the loop, so they are
    // read-write operands. inc is not modified inside the asm; it is listed
    // as read-write only to keep the operand numbering. "memory" covers the
    // pixel loads and stores.
    asm volatile (
		  "movq       MASK_AND,  %%mm5\n"
		  "1:\t"
		  "movq       (%1),   %%mm0\n"      /* left 8 pixels of source1 */
		  "movq       (%4),   %%mm1\n"      /* left 8 pixels of source2 */
		  "psrlw      $1,%%mm0\n"
		  "psrlw      $1,%%mm1\n"
		  "pand       %%mm5,%%mm0\n"
		  "pand       %%mm5,%%mm1\n"
		  "paddusb    %%mm1,%%mm0\n"
		  "leal       8(%1),%1\n"           /* source1 += 8 */
		  "leal       8(%4),%4\n"           /* source2 += 8 */
		  "movq       %%mm0,(%2)\n"
		  "leal       8(%2),%2\n"           /* dest += 8 */

		  "movq       (%1),   %%mm0\n"      /* right 8 pixels of source1 */
		  "movq       (%4),   %%mm1\n"      /* right 8 pixels of source2 */
		  "psrlw      $1,%%mm0\n"
		  "psrlw      $1,%%mm1\n"
		  "pand       %%mm5,%%mm0\n"
		  "pand       %%mm5,%%mm1\n"
		  "paddusb    %%mm1,%%mm0\n"
		  "leal       (%3,%1),%1\n"         /* source1 += inc-8 */
		  "leal       (%3,%4),%4\n"         /* source2 += inc-8 */
		  "movq       %%mm0,(%2)\n"
		  "leal       8(%2),%2\n"           /* dest += 8 */

		  "decl       %0\n"
		  "jnz        1b\n"
		  : "+c"(h), "+r"(source1), "+r"(dest), "+r"(inc), "+r"(source2)
		  :
		  : "memory", "cc"
		  );
#endif
  }

}
  


/*
 * Averages four 8x8 byte blocks into dest with rounding:
 * every pixel is (source1 + source2 + source3 + source4 + 2) >> 2.
 * inc is the distance in bytes between two rows and is applied to all five
 * pointers.
 */
void CopyFunctions::copy8_div4_nocrop(unsigned char* source1,
				      unsigned char* source2,
				      unsigned char* source3,
				      unsigned char* source4,
				      unsigned char* dest,int inc) {
  for (int row = 0; row < 8; row++) {
    for (int col = 0; col < 8; col++) {
      const int sum = source1[col]+source2[col]+source3[col]+source4[col] + 2;
      dest[col] = sum >> 2;
    }
    source1 += inc;
    source2 += inc;
    source3 += inc;
    source4 += inc;
    dest += inc;
  }
}

// Optimize me!
// Should be MMX; performance analysis shows: 8 % overall time

/*
 * Adds an 8x8 block of 16-bit values to an 8x8 byte block and stores the
 * result, clamped to 0..255, in dest.
 *
 *   source1  first byte of the byte block
 *   source2  64 consecutive shorts (rows follow each other directly)
 *   dest     first byte of the destination block
 *   inc      distance in bytes between two rows of source1 and of dest
 *
 * Without MMX the clamping is done through the crop table cm. The MMX path
 * (INTEL builds only) widens the bytes to words, adds, and saturates with
 * packuswb.
 * NOTE(review): the MMX path was marked "buggy" by the original author. The
 * operand constraints were one concrete problem; whether further issues
 * remain has not been verified.
 */
void CopyFunctions::copy8_src2linear_crop(unsigned char* source1,
					  short int* source2,
					  unsigned char* dest,int inc) {
  if (lmmx == false) {
    for (int row = 0; row < 8; row++) {
      for (int col = 0; col < 8; col++) {
	dest[col] = cm[(int) source1[col] + (int) source2[col]];
      }
      dest += inc;
      source1 += inc;
      source2 += 8;
    }
  } else {
#ifdef INTEL
    int rr=8;

    // source1, source2, dest and rr are changed by the loop, so they are
    // read-write operands. inc is not modified; it is listed as read-write
    // only to keep the operand numbering. "memory" covers the loads and
    // stores.
    asm volatile (
      ".align 32\n"
      "pxor      %%mm2 ,%%mm2\n" //    0    0    0    0    0    0    0    0
      "1:\n"
      "movq      (%0)  ,%%mm0\n" // s1_7 s1_6 s1_5 s1_4 s1_3 s1_2 s1_1 s1_0
      "movq      (%0)  ,%%mm4\n" // s1_7 s1_6 s1_5 s1_4 s1_3 s1_2 s1_1 s1_0
      "punpckhbw %%mm2 ,%%mm0\n" //   0 s1_7    0 s1_6    0 s1_5    0  s1_4
      "punpcklbw %%mm2 ,%%mm4\n" //   0 s1_3    0 s1_2    0 s1_1    0  s1_0
      "movq      (%1)  ,%%mm1\n" //   s2_3 s2_2 s2_1 s2_0 (words)
      "movq      8(%1) ,%%mm5\n" //   s2_7 s2_6 s2_5 s2_4 (words)
      "paddw    %%mm0 ,%%mm5\n"  //   mm5 = s1_4..7 + s2_4..7
      "paddw    %%mm4 ,%%mm1\n"  //   mm1 = s1_0..3 + s2_0..3
      "packuswb  %%mm5 ,%%mm1\n" //   saturate to bytes, same as cm[...]

      "movq      %%mm1 ,(%2)\n"  //   wrote out

      "leal      (%0,%3), %0\n"  //   source1+=inc
      "leal      16(%1) , %1\n"  //   source2+=8 shorts
      "leal      (%2,%3), %2\n"  //   dest+=inc
      "decl      %4\n"
      "jnz       1b\n"
      //"emms\n"
      : "+r"(source1), "+r"(source2), "+r"(dest),"+r"(inc),"+r"(rr)
      :
      : "memory", "cc"
      );
#endif
  }

}

// Optimize me!
// Should be MMX; performance analysis shows: 13 % overall time
/*
 * Averages two 8x8 byte blocks with rounding, adds an 8x8 block of 16-bit
 * values and stores the result, clamped to 0..255, in dest:
 * dest = clamp(((source1 + source2 + 1) >> 1) + source3).
 *
 *   source1, source2  first bytes of the two byte blocks
 *   source3           64 consecutive shorts (rows follow each other directly)
 *   dest              first byte of the destination block
 *   inc               distance in bytes between two rows of source1,
 *                     source2 and dest
 *
 * Without MMX the clamping is done through the crop table cm. The MMX path
 * (INTEL builds only) widens the bytes to words, averages, adds source3 and
 * saturates with packuswb.
 * NOTE(review): the MMX path was marked "buggy" by the original author. The
 * operand constraints were one concrete problem; whether further issues
 * remain has not been verified.
 */
void CopyFunctions::copy8_div2_src3linear_crop(unsigned char* source1,
					       unsigned char* source2,
					       short int* source3,
					       unsigned char* dest,int inc) {
  if (lmmx==false) {
    for (int row = 0; row < 8; row++) {
      for (int col = 0; col < 8; col++) {
	dest[col] = cm[((int) (source1[col] + source2[col]+1) >> 1) + source3[col]];
      }
      dest += inc;
      source1 += inc;
      source2 += inc;
      source3 += 8;
    }
  } else {
#ifdef INTEL
    int rr=8;

    // source1, source2, source3, dest and rr are changed by the loop, so
    // they are read-write operands. inc is not modified; it is listed as
    // read-write only to keep the operand numbering. "memory" covers the
    // loads and stores.
    asm volatile (
        "pxor      %%mm2 ,%%mm2\n" //   0    0    0    0    0    0    0    0
        "pxor      %%mm3 ,%%mm3\n" //   0    0    0    0    0    0    0    0
	"movq      ADDW_1,%%mm6\n" //   0    1    0    1    0    1    0    1
	"1:\n"
	"movq      (%0)  ,%%mm0\n" // s1_7 s1_6 s1_5 s1_4 s1_3 s1_2 s1_1 s1_0
	"movq      (%1)  ,%%mm1\n" // s2_7 s2_6 s2_5 s2_4 s2_3 s2_2 s2_1 s2_0
	"movq      %%mm0 ,%%mm4\n" // s1_7 s1_6 s1_5 s1_4 s1_3 s1_2 s1_1 s1_0
	"movq      %%mm1 ,%%mm5\n" // s2_7 s2_6 s2_5 s2_4 s2_3 s2_2 s2_1 s2_0
	"punpckhbw %%mm2 ,%%mm0\n" //   0 s1_7    0 s1_6    0 s1_5    0  s1_4
	"punpckhbw %%mm3 ,%%mm1\n" //   0 s2_7    0 s2_6    0 s2_5    0  s2_4
	"punpcklbw %%mm2 ,%%mm4\n" //   0 s1_3    0 s1_2    0 s1_1    0  s1_0
	"punpcklbw %%mm3 ,%%mm5\n" //   0 s2_3    0 s2_2    0 s2_1    0  s2_0
	"paddusw   %%mm4 ,%%mm5\n" //   mm5=s1_0..3 + s2_0..3
	"paddusw   %%mm0 ,%%mm1\n" //   mm1=s1_4..7 + s2_4..7
	"paddusw   %%mm6 ,%%mm5\n" //   mm5=mm5 + 1
	"paddusw   %%mm6 ,%%mm1\n" //   mm1=mm1 + 1
	"psraw     $1    ,%%mm1\n" //   mm1=mm1/2
	"psraw     $1    ,%%mm5\n" //   mm5=mm5/2
        "movq      (%2)  ,%%mm0\n" //   s3_3 s3_2 s3_1 s3_0 (words)
        "movq      8(%2) ,%%mm4\n" //   s3_7 s3_6 s3_5 s3_4 (words)
	"paddw     %%mm0 ,%%mm5\n" //   mm5=mm5 + s3_0..3
	"paddw     %%mm4 ,%%mm1\n" //   mm1=mm1 + s3_4..7
	"packuswb  %%mm1 ,%%mm5\n" //   saturate to bytes, same as cm[...]
        "movq      %%mm5 ,(%3)\n"  //   wrote out
	"leal      (%0,%4), %0\n"  //   source1+=inc
	"leal      (%1,%4), %1\n"  //   source2+=inc
	"addl      $16    ,%2 \n"  //   source3+=8 shorts
	"leal      (%3,%4), %3\n"  //   dest+=inc, flags untouched
        "decl      %5\n"
	"jnz       1b\n"
	: "+r"(source1), "+r"(source2), "+r"(source3),
	  "+r"(dest),"+r"(inc),"+r"(rr)
	:
	: "memory", "cc"
	);

#endif
  }


}


/*
 * Averages four 8x8 byte blocks with rounding, adds an 8x8 block of 16-bit
 * values and stores the result through the crop table cm:
 * dest = cm[((source1 + source2 + source3 + source4 + 2) >> 2) + source5].
 * inc is the distance in bytes between two rows of the four byte sources and
 * of dest; source5 is read as 64 consecutive shorts.
 */
void CopyFunctions::copy8_div4_src5linear_crop(unsigned char* source1,
					       unsigned char* source2,
					       unsigned char* source3,
					       unsigned char* source4,
					       short int* source5,
					       unsigned char* dest,int inc) {
  for (int row = 0; row < 8; row++) {
    for (int col = 0; col < 8; col++) {
      const int average =
	(int) (source1[col]+source2[col]+source3[col]+source4[col]+2) >> 2;
      dest[col]=cm[average + source5[col]];
    }
    source1 += inc;
    source2 += inc;
    source3 += inc;
    source4 += inc;
    source5 += 8;
    dest +=inc;
  }
}
