/*
    libfame - Fast Assembly MPEG Encoder Library
    Copyright (C) 2000-2001 Vivien Chappelier

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**************************** mpeg encoder ***********************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "fame.h"
#include "fame_encoder.h"
#include "fame_encoder_mpeg.h"
#include "table_scale.h"
/* Architecture selection: include either the MMX or the floating-point     */
/* kernel headers, and define the two hooks that mpeg_enter() and           */
/* mpeg_leave() invoke at the start and at the end of a picture.            */
/* The included headers presumably provide dct(), quantize(), diff() and    */
/* the prefetch_*() routines used further down -- none of them is declared  */
/* in this file.                                                            */
#if defined(HAS_MMX)
/* nothing to do when a picture starts */
#define arch_enter_state()
/* EMMS empties the MMX state so that x87 floating-point code can run again */
#define arch_leave_state() asm("emms")
#include "transpose_mmx.h"
#include "dct_mmx.h"
#include "quantize_mmx.h"
#include "fetch_mmx.h"
#else
/* no architecture state to manage: both hooks expand to nothing */
#define arch_enter_state() 
#define arch_leave_state() 
#include "dct_float.h"
#include "quantize_float.h"
#include "fetch_float.h"
#endif

/* The test below is meant to select GCC 2.95.3 and older.  Undefined       */
/* macros evaluate to 0 inside #if, so a compiler that does not define      */
/* __GNUC__ also takes this branch.                                         */
#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ <= 95 && __GNUC_PATCHLEVEL__ <= 3)
/* gcc bug?? workaround: mpeg_init() calls this from its MMX path to force  */
/* an unoptimized access to q; the function is not defined in this file     */
extern void __fame_dummy_call(int q);
#endif

/* Encoder methods implemented in this file.  They are declared ahead of    */
/* the constructor, which stores their addresses in the encoder object.     */

/* one-time setup: frame size, quantisation tables, mismatch control */
static void mpeg_init(fame_encoder_t *encoder,
                      int width, int height,
                      unsigned char *intra_quantisation_table,
                      unsigned char *inter_quantisation_table,
                      unsigned char *intra_dc_y_scale_table,
                      unsigned char *intra_dc_c_scale_table,
                      fame_mismatch_t mismatch_type);

/* start of a picture */
static void mpeg_enter(fame_encoder_t *encoder,
                       fame_yuv_t **past_ref,
                       fame_yuv_t **new_ref,
                       fame_yuv_t **future_ref,
                       fame_yuv_t *yuv,
                       unsigned char *shape);

/* one intra macroblock */
static void mpeg_encode_intra_mb(fame_encoder_t *encoder,
                                 short x, short y,
                                 short *blocks[6],
                                 unsigned char q,
                                 fame_bab_t bab_type);

/* one inter macroblock */
static void mpeg_encode_inter_mb(fame_encoder_t *encoder,
                                 short x, short y,
                                 short *blocks[6],
                                 fame_motion_vector_t *forward,
                                 fame_motion_vector_t *backward,
                                 fame_motion_coding_t motion_coding,
                                 unsigned char q,
                                 fame_bab_t bab_type);

/* end of a picture */
static void mpeg_leave(fame_encoder_t *encoder);

/* release of the encoder's resources */
static void mpeg_close(fame_encoder_t *encoder);

FAME_CONSTRUCTOR(fame_encoder_mpeg_t)
{
  FAME_OBJECT(this)->name = "MPEG encoder";
  FAME_ENCODER(this)->init = mpeg_init;
  FAME_ENCODER(this)->enter = mpeg_enter;
  FAME_ENCODER(this)->encode_intra_mb = mpeg_encode_intra_mb;
  FAME_ENCODER(this)->encode_inter_mb = mpeg_encode_inter_mb;
  FAME_ENCODER(this)->leave = mpeg_leave;
  FAME_ENCODER(this)->close = mpeg_close;
  return(this);
}

/*  mpeg_init                                                                */
/*                                                                           */
/*  Description:                                                             */
/*    Initialize the encoder: store the frame size and the mismatch type,    */
/*    allocate the padded shape buffer and, for each quantizer scale q in    */
/*    1..31, precompute the multiplier and rounding tables that the          */
/*    macroblock encoders pass to quantize().                                */
/*                                                                           */
/*  Arguments:                                                               */
/*    fame_encoder_t *encoder: the encoder to initialize                     */
/*    int width: width of the frame                                          */
/*    int height: height of the frame                                        */
/*    unsigned char *iqtable: quantisation matrix for intra                  */
/*      (entries 1..63 are read; entry 0 is not used here)                   */
/*    unsigned char *niqtable: quantisation matrix for inter                 */
/*      (entries 0..63 are read)                                             */
/*    unsigned char *intra_dc_y_scale_table: quantisation table for DC of Y  */
/*      (indexed by q, entries 1..31 are read)                               */
/*    unsigned char *intra_dc_c_scale_table: quantisation table for DC of C  */
/*      (indexed by q, entries 1..31 are read)                               */
/*    fame_mismatch_t mismatch_type: type of mismatch control                */
/*                                                                           */
/*  Return value:                                                            */
/*    None.                                                                  */

static void mpeg_init(fame_encoder_t *encoder,
		      int width,
		      int height,
		      unsigned char *iqtable,
		      unsigned char *niqtable,
		      unsigned char *intra_dc_y_scale_table,
		      unsigned char *intra_dc_c_scale_table,
		      fame_mismatch_t mismatch_type)
{
  fame_encoder_mpeg_t *encoder_mpeg = FAME_ENCODER_MPEG(encoder);
  int i, q;

  /* set width and height */
  encoder_mpeg->width = width;
  encoder_mpeg->height = height;

  /* allocate padded shape buffer (width*height bytes) */
  /* NOTE(review): the malloc() result is not checked and this function */
  /* returns void, so a failed allocation leaves padded == NULL without */
  /* telling the caller; in the code visible here the buffer is only */
  /* freed again by mpeg_close() -- confirm how other users treat NULL */
  encoder_mpeg->padded = (unsigned char *) malloc(encoder_mpeg->width*
						   encoder_mpeg->height);
  /* the mismatch type is only recorded here */
  encoder_mpeg->mismatch = mismatch_type;

  /* compute quantization matrixes */
  /* q starts at 1 (it is used as a divisor below), so index 0 of every */
  /* per-q table is left unwritten by this function */
  for(q = 1; q < 32; q++) {
    /* compute the intra quantisation and dequantisation DC scaler */
    /* entry 0 of the intra tables comes from the DC scale tables and */
    /* postscale[0]; iqtable[0] is not involved */
#ifdef HAS_MMX
    /* emms is issued before the floating-point arithmetic below -- */
    /* presumably because MMX state may still be live here; TODO confirm */
    asm("emms");
    /* MMX build: multipliers are scaled by 2^16 before the conversion to */
    /* dct_t, rounding terms get +0.5 before the conversion */
    encoder_mpeg->yiqmatrixes[q][0] = 
      (dct_t) ((double)(1UL<<16)*postscale[0]/intra_dc_y_scale_table[q]);
    encoder_mpeg->ciqmatrixes[q][0] =
      (dct_t) ((double)(1UL<<16)*postscale[0]/intra_dc_c_scale_table[q]);
    encoder_mpeg->yiqround[q][0] = 
      (dct_t) ((double)intra_dc_y_scale_table[q]/(2*postscale[0])+0.5);
    encoder_mpeg->ciqround[q][0] = 
      (dct_t) ((double)intra_dc_c_scale_table[q]/(2*postscale[0])+0.5);
#else
    /* non-MMX build: same ratios, stored without the 2^16 scaling or +0.5 */
    encoder_mpeg->yiqmatrixes[q][0] = postscale[0] / intra_dc_y_scale_table[q];
    encoder_mpeg->ciqmatrixes[q][0] = postscale[0] / intra_dc_c_scale_table[q];
    encoder_mpeg->yiqround[q][0] = ((dct_t) intra_dc_y_scale_table[q])/(2*postscale[0]);
    encoder_mpeg->ciqround[q][0] = ((dct_t) intra_dc_c_scale_table[q])/(2*postscale[0]);
#endif

    /* compute the intra quantisation and dequantisation matrix */
    /* entries 1..63; luma and chroma tables receive identical values */
    for(i = 1; i < 64; i++)
    {
#ifdef HAS_MMX
#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ <= 95 && __GNUC_PATCHLEVEL__ <= 3)
//#error Your GCC is too old, and may produce bad code for libfame.
      /* gcc bug here?? try to comment/uncomment the following line*/
      /* or was I wrong in some earlier asm directive??! */
      /* force unoptimized access to q */
      __fame_dummy_call(q);
#endif
      /* 2^19 = 2^16 * 8: the non-MMX multiplier below, scaled by 2^16 */
      encoder_mpeg->yiqmatrixes[q][i] = encoder_mpeg->ciqmatrixes[q][i] =
	(dct_t) ((double)(1UL<<19)*postscale[i] / (q*iqtable[i]));
      /* (6*q+3)/4 is an integer division (q is an int) */
      encoder_mpeg->yiqround[q][i] = encoder_mpeg->ciqround[q][i] = 
	(dct_t) ((double)((1+(6*q+3)/4) * iqtable[i]) / (4 * 8.0 * postscale[i]) + 0.5);
#else
      encoder_mpeg->yiqmatrixes[q][i] = encoder_mpeg->ciqmatrixes[q][i] =
	8.0 * postscale[i] / (q * iqtable[i]);
      /* mpeg-4 rounding gives better rate-distortion results */
      /* than mpeg-1 except maybe for q = 1 (need more tests) */
      /* (6*q+3)/4 is an integer division (q is an int) */
      encoder_mpeg->yiqround[q][i] = encoder_mpeg->ciqround[q][i] = 
	((dct_t) (1+(6*q+3)/4) * iqtable[i]) / (4 * 8.0 * postscale[i]);
#endif
    }

    /* compute the inter quantisation and dequantisation matrix */
    /* all 64 entries, DC included; the rounding term does not depend on q */
    for(i = 0; i < 64; i++)
    {
#ifdef HAS_MMX
      encoder_mpeg->niqmatrixes[q][i] =
	(dct_t) ((double)(1UL<<19)*postscale[i]/(q*niqtable[i]));
      encoder_mpeg->niqround[q][i] =
	(dct_t) ((double)niqtable[i] / (4 * 8.0 * postscale[i]) + 0.5);

#else
      encoder_mpeg->niqmatrixes[q][i] =
	8.0 * postscale[i] / (q * niqtable[i]);
      encoder_mpeg->niqround[q][i] = 
	(dct_t) niqtable[i] / (4 * 8.0 * postscale[i]);
#endif
    }		     
  }
}

/*  mpeg_enter                                                               */
/*                                                                           */
/*  Description:                                                             */
/*    Start encoding a new picture: remember the source picture, its shape   */
/*    mask and the reference picture sets for the macroblock encoders, then  */
/*    run the architecture-specific entry hook.  Only the pointers are       */
/*    stored; no picture data is copied.                                     */
/*                                                                           */
/*  Arguments:                                                               */
/*    fame_encoder_t *encoder: the encoder                                   */
/*    fame_yuv_t **past_ref: past reference images                           */
/*    fame_yuv_t **new_ref: new reconstructed reference images               */
/*    fame_yuv_t **future_ref: future reference images                       */
/*    fame_yuv_t *yuv: source image                                          */
/*    unsigned char *shape: shape binary mask                                */
/*                                                                           */
/*  Return value:                                                            */
/*    None.                                                                  */

static void mpeg_enter(fame_encoder_t *encoder,
                       fame_yuv_t **past_ref,
                       fame_yuv_t **new_ref,
                       fame_yuv_t **future_ref,
                       fame_yuv_t *yuv,
                       unsigned char *shape)
{
  fame_encoder_mpeg_t *self = FAME_ENCODER_MPEG(encoder);

  /* source picture and its shape mask */
  self->input = yuv;
  self->shape = shape;

  /* reference picture sets */
  self->past_ref = past_ref;
  self->new_ref = new_ref;
  self->future_ref = future_ref;

  /* architecture-specific hook (see arch_enter_state) */
  arch_enter_state();
}
  
/*  mpeg_encode_intra_mb                                                     */
/*                                                                           */
/*  Description:                                                             */
/*    Encode an intra macroblock: each of the six 8x8 blocks (four Y, then   */
/*    U and V) is fetched from the source picture, transformed with dct()    */
/*    and quantized into the encoder's own coefficient buffers.  On return   */
/*    blocks[0..5] point at those buffers.                                   */
/*                                                                           */
/*  Arguments:                                                               */
/*    fame_encoder_t *encoder: the encoder                                   */
/*    short x: the x location of the macroblock in macroblock units          */
/*    short y: the y location of the macroblock in macroblock units          */
/*    short *blocks[6]: receives pointers to the quantized blocks            */
/*    unsigned char q: the quantizer scale for this block; selects the       */
/*      tables prepared by mpeg_init(), which fills entries 1..31 only       */
/*    fame_bab_t bab_type: binary alpha block type; anything other than      */
/*      bab_all_coded selects the fetch routines that take the shape mask    */
/*      into account                                                         */
/*                                                                           */
/*  Return value:                                                            */
/*    None.                                                                  */

static void mpeg_encode_intra_mb(fame_encoder_t *encoder,
                                 short x,
                                 short y,
                                 short *blocks[6],
                                 unsigned char q,
                                 fame_bab_t bab_type)
{
  fame_encoder_mpeg_t *enc = FAME_ENCODER_MPEG(encoder);
  void (* fetch_luma)(unsigned char *input,
                      dct_t *output,
                      unsigned char *shape,
                      int pitch);
  void (* fetch_chroma)(unsigned char *input,
                        dct_t *output,
                        unsigned char *shape,
                        int pitch);
  void (* transform)(dct_t *block);
  void (* quantise)(short *block, dct_t *qblock, dct_t *matrix, dct_t *round);
  unsigned long luma[4];   /* offsets of the four Y blocks in the Y plane */
  unsigned long chroma;    /* offset of the block in the U and V planes */
  int stride;
  int b;

  stride = enc->input->p;

  /* a macroblock covers 16x16 luma samples and 8x8 samples per chroma plane */
  luma[0] = (y << 4) * stride + (x << 4);       /* Y(0,0) */
  luma[1] = luma[0] + 8;                        /* Y(0,1) */
  luma[2] = luma[0] + (stride << 3);            /* Y(1,0) */
  luma[3] = luma[2] + 8;                        /* Y(1,1) */
  chroma = (y << 3) * (stride >> 1) + (x << 3); /* Cb and Cr share it */

  /* hand the encoder's coefficient buffers back to the caller */
  for(b = 0; b < 6; b++)
    blocks[b] = enc->blocks[b];

  /* pick the fetch routines according to the shape of the macroblock */
  if(bab_type == bab_all_coded)
  {
    fetch_luma = prefetch_withoutmask;
    fetch_chroma = prefetch_withoutmask;
  }
  else
  {
    fetch_luma = prefetch_Y_withmask;
    fetch_chroma = prefetch_C_withmask;
  }
  transform = dct;
  quantise = quantize;

  /* Y(0,0), Y(0,1), Y(1,0), Y(1,1) */
  for(b = 0; b < 4; b++)
  {
    fetch_luma(enc->input->y + luma[b],
               enc->tmpblock,
               enc->shape + luma[b],
               stride);
    transform(enc->tmpblock);
    quantise(enc->blocks[b],
             enc->tmpblock,
             enc->yiqmatrixes[q],
             enc->yiqround[q]);
  }

  /* U: the shape pointer is the top left corner of the macroblock */
  fetch_chroma(enc->input->u + chroma,
               enc->tmpblock,
               enc->shape + luma[0],
               stride >> 1);
  transform(enc->tmpblock);
  quantise(enc->blocks[4],
           enc->tmpblock,
           enc->ciqmatrixes[q],
           enc->ciqround[q]);

  /* V: same offset and shape pointer as U */
  fetch_chroma(enc->input->v + chroma,
               enc->tmpblock,
               enc->shape + luma[0],
               stride >> 1);
  transform(enc->tmpblock);
  quantise(enc->blocks[5],
           enc->tmpblock,
           enc->ciqmatrixes[q],
           enc->ciqround[q]);
}

/*  mpeg_encode_inter_mb                                                     */
/*                                                                           */
/*  Description:                                                             */
/*    Encode an inter macroblock: for each 8x8 block the difference between  */
/*    the source picture and the motion-compensated reference is computed    */
/*    with diff(), transformed with dct() and quantized with the inter       */
/*    tables.  The reference picture of a block is future_ref[n], where n    */
/*    is built from the low bits of its motion vector (dy bit 0 -> bit 1,    */
/*    dx bit 0 -> bit 0); the remaining bits give the full-pel displacement. */
/*    A Y block whose forward[].error is below quant_scale*16 is not coded   */
/*    and its blocks[] entry is set to NULL.  U and V are always coded.      */
/*                                                                           */
/*  Arguments:                                                               */
/*    fame_encoder_t *encoder: the encoder                                   */
/*    short x: the x location of the macroblock in macroblock units          */
/*    short y: the y location of the macroblock in macroblock units          */
/*    short *blocks[6]: receives pointers to the quantized blocks, or NULL   */
/*      for a skipped Y block                                                */
/*    fame_motion_vector_t *forward: forward motion vectors, one per block   */
/*      (entries 0..5 are read)                                              */
/*    fame_motion_vector_t *backward: backward motion vectors (not used by   */
/*      this function)                                                       */
/*    fame_motion_coding_t motion_coding: not used by this function          */
/*    unsigned char q: the quantizer scale for this block; selects the       */
/*      tables prepared by mpeg_init(), which fills entries 1..31 only       */
/*    fame_bab_t bab_type: binary alpha block type (not used by this         */
/*      function)                                                            */
/*                                                                           */
/*  Return value:                                                            */
/*    None.                                                                  */

static void mpeg_encode_inter_mb(fame_encoder_t *encoder,
                                 short x,
                                 short y,
                                 short *blocks[6],
                                 fame_motion_vector_t *forward,
                                 fame_motion_vector_t *backward,
                                 fame_motion_coding_t motion_coding,
                                 unsigned char q,
                                 fame_bab_t bab_type)
{
  fame_encoder_mpeg_t *enc = FAME_ENCODER_MPEG(encoder);
  unsigned long src[6];   /* offsets into the source planes */
  signed long phase[6];   /* half-pel phase, index into future_ref[] */
  signed long ref[6];     /* offsets into the reference planes */
  int stride, ref_stride;
  int b;
  void (* subtract)(unsigned char *input,
                    unsigned char *ref,
                    dct_t *output,
                    int ipitch,
                    int rpitch);
  void (* transform)(dct_t *block);
  void (* quantise)(short *block, dct_t *qblock, dct_t *matrix, dct_t *round);

  /* offsets of the six blocks in the source picture */
  stride = enc->input->p;
  src[0] = (y << 4) * stride + (x << 4);        /* Y(0,0) */
  src[1] = src[0] + 8;                          /* Y(0,1) */
  src[2] = src[0] + (stride << 3);              /* Y(1,0) */
  src[3] = src[2] + 8;                          /* Y(1,1) */
  src[4] = (y << 3) * (stride >> 1) + (x << 3); /* Cb     */
  src[5] = src[4];                              /* Cr     */

  /* motion is half-pixel coded: the low bit of each component selects */
  /* one of the reference pictures */
  for(b = 0; b < 6; b++)
    phase[b] = ((forward[b].dy & 1) << 1) | (forward[b].dx & 1);

  /* full-pel displacement of the Y blocks, using the pitch of the */
  /* selected reference picture */
  for(b = 0; b < 4; b++)
  {
    int down = (b & 2) ? 8 : 0;   /* lower pair of blocks */
    int right = (b & 1) ? 8 : 0;  /* right-hand pair of blocks */

    ref_stride = enc->future_ref[phase[b]]->p;
    ref[b] = ((y<<4)+(forward[b].dy>>1)+down)*ref_stride+(forward[b].dx>>1)+(x<<4)+right;
  }

  /* full-pel displacement of the U and V blocks (half the pitch) */
  for(b = 4; b < 6; b++)
  {
    ref_stride = enc->future_ref[phase[b]]->p;
    ref[b] = ((y<<3)+(forward[b].dy>>1))*(ref_stride>>1)+(forward[b].dx>>1)+(x<<3);
  }

  /* hand the encoder's coefficient buffers back to the caller */
  for(b = 0; b < 6; b++)
    blocks[b] = enc->blocks[b];

  subtract = diff;
  transform = dct;
  quantise = quantize;

  /* Y */
  for(b = 0; b < 4; b++)
  {
    if(forward[b].error < enc->quant_scale*16)
    {
      /* prediction error is small enough: the block is not coded */
      blocks[b] = NULL;
      continue;
    }
    /* the reference pitch is passed as the source pitch plus 32 -- */
    /* presumably the reference planes are 32 samples wider; TODO confirm */
    subtract(enc->input->y + src[b],
             enc->future_ref[phase[b]]->y + ref[b],
             enc->tmpblock,
             stride,
             stride+32);
    transform(enc->tmpblock);
    quantise(enc->blocks[b],
             enc->tmpblock,
             enc->niqmatrixes[q],
             enc->niqround[q]);
  }

  /* U */
  /* TODO: skip block with error < quant_scale*16 */
  subtract(enc->input->u + src[4],
           enc->future_ref[phase[4]]->u + ref[4],
           enc->tmpblock,
           stride >> 1,
           (stride+32) >> 1);
  transform(enc->tmpblock);
  quantise(enc->blocks[4],
           enc->tmpblock,
           enc->niqmatrixes[q],
           enc->niqround[q]);

  /* V */
  /* TODO: skip block with error < quant_scale*16 */
  subtract(enc->input->v + src[5],
           enc->future_ref[phase[5]]->v + ref[5],
           enc->tmpblock,
           stride >> 1,
           (stride+32) >> 1);
  transform(enc->tmpblock);
  quantise(enc->blocks[5],
           enc->tmpblock,
           enc->niqmatrixes[q],
           enc->niqround[q]);
}

/*  mpeg_leave                                                               */
/*                                                                           */
/*  Description:                                                             */
/*    End the encoding of a picture.  Only the architecture-specific exit    */
/*    hook is run; the encoder object itself is not touched.                 */
/*                                                                           */
/*  Arguments:                                                               */
/*    fame_encoder_t *encoder: the encoder (not used)                        */
/*                                                                           */
/*  Return value:                                                            */
/*    None.                                                                  */

static void mpeg_leave(fame_encoder_t *encoder)
{
  /* the parameter is part of the method signature but is not needed here */
  (void) encoder;

  /* architecture-specific hook (see arch_leave_state) */
  arch_leave_state();
}

/*  mpeg_close                                                               */
/*                                                                           */
/*  Description:                                                             */
/*    Release the encoder: free the padded shape buffer allocated by         */
/*    mpeg_init() and clear the pointer, so that closing the same encoder    */
/*    a second time does not free the buffer twice.                          */
/*                                                                           */
/*  Arguments:                                                               */
/*    fame_encoder_t *encoder: the encoder                                   */
/*                                                                           */
/*  Return value:                                                            */
/*    None.                                                                  */

static void mpeg_close(fame_encoder_t *encoder)
{
  fame_encoder_mpeg_t *encoder_mpeg = FAME_ENCODER_MPEG(encoder);

  /* free shape padding buffer; free(NULL) is a no-op, so this is also */
  /* safe when the allocation in mpeg_init() failed */
  free(encoder_mpeg->padded);

  /* do not leave a dangling pointer behind */
  encoder_mpeg->padded = NULL;
}
