#include <malloc.h>

#include "common.h"
#include "jive.h"


/* based on FS dither implementation from petern@bmtmicro.com - see bug 7462 */

/* Pixel layout: number of bytes per pixel and the byte offset of each
 * of the three quantized components inside a pixel.
 */
#define C_WIDTH   4  /* Component width (e.g. 3 for RGB, 4 for RGBA) */
#define C0_INDEX  1  /* Index of first component (e.g. 0 for RGBA or RGB, 1 for ARGB) */
#define C1_INDEX  2  /* Index of second component (e.g. 1 for RGBA or RGB, 2 for ARGB) */
#define C2_INDEX  3  /* Index of third component (e.g. 2 for RGBA or RGB, 3 for ARGB) */

/* Number of significant bits kept for each component (5-6-5). */
#define C0_BITS   5
#define C1_BITS   6
#define C2_BITS   5

/* Mask that keeps the top `bits` bits of an 8-bit value,
 * e.g. C_MASK(5) == 0xF8 and C_MASK(6) == 0xFC.
 * The argument is parenthesized so that any expression can be passed.
 */
#define C_MASK(bits) ((0xFF00 >> (bits)) & 0xFF)

/* Byte distance between two consecutive rows: the row size in bytes
 * rounded up to a multiple of 4, i.e. ((bytes+3)&~3) for double word
 * alignment.  The argument is parenthesized so that an expression such
 * as (w + 1) is multiplied as a whole.
 */
#define DATA_ROW_BYTE_WIDTH(image_width) (((image_width) * C_WIDTH + 3) & ~3)

/* Dither applies Floyd-Steinberg dithering to the three colour
 * components of every pixel, reducing them to the bits selected by
 * C_MASK(C0_BITS), C_MASK(C1_BITS) and C_MASK(C2_BITS).  pInData and
 * pOutData can point to the same memory block.
 *
 * pInData   source pixels, C_WIDTH bytes per pixel, rows
 *           DATA_ROW_BYTE_WIDTH(width) bytes apart
 * pOutData  destination pixels, same layout; only the bytes at
 *           C0_INDEX, C1_INDEX and C2_INDEX of each pixel are written
 * width     image width in pixels
 * height    image height in pixels
 *
 * Rows are processed from the last row to the first, and the horizontal
 * direction alternates on every row.  The function returns without
 * touching pOutData when width or height is not positive or when the
 * error buffer cannot be allocated.
 */

void Dither (Uint8* pInData, Uint8* pOutData, int width, int height) {
	Uint8  clampTab[3 * 256];       /* clampTab[v + 256] is v limited to 0..255 */
	Sint32 errorClampTab[2 * 256];  /* errorClampTab[e + 256] is the limited error for e in -255..255 */
	Sint32 *floydSteinbergError;
	int    dirC = C_WIDTH;          /* pixel step in bytes, sign flips every row */
	int    dirE = 3;                /* error buffer step in entries, sign flips every row */
	int    row;

	/* A negative height would keep "while (row--)" running on negative
	 * rows, and a negative width gives a bogus allocation size. */
	if (width <= 0 || height <= 0) {
		return;
	}

	{
		int i;
		for (i = 0; i < 256; i++) {
			clampTab[i] = 0;
			clampTab[i + 256] = (Uint8)i;
			clampTab[i + 2 * 256] = 255;
		}
	}

	/* Error limiter: identity up to 15, then growing by one for every
	 * two steps up to 47, then constant (32).  Negative inputs mirror
	 * the positive ones. */
	{
		Sint32 clampVal = 0;
		int    clampIdx = 0;
		for (; clampIdx < 256 / 16; clampIdx++) {
			errorClampTab[clampIdx + 256] = clampVal;
			errorClampTab[-clampIdx + 256] = -clampVal;
			clampVal++;
		}
		for (; clampIdx < 3 * 256 / 16; clampIdx++) {
			errorClampTab[clampIdx + 256] = clampVal;
			errorClampTab[-clampIdx + 256] = -clampVal;
			clampVal += clampIdx & 1;
		}
		for (; clampIdx < 256; clampIdx++) {
			errorClampTab[clampIdx + 256] = clampVal;
			errorClampTab[-clampIdx + 256] = -clampVal;
		}
	}

	/* One entry of three Sint32 per column plus one spare entry at each
	 * end.  The buffer must start zeroed, hence calloc. */
	floydSteinbergError = calloc ((size_t)width + 2, 3 * sizeof (Sint32));
	if (floydSteinbergError == NULL) {
		return;
	}

	row = height;
	while (row--) {
		Uint8* pIn  = pInData  + DATA_ROW_BYTE_WIDTH (width) * row;
		Uint8* pOut = pOutData + DATA_ROW_BYTE_WIDTH (width) * row;
		Sint32* pError = floydSteinbergError;

		/* c0..c2 carry 7x the error of the previous pixel of this row */
		Sint32 c0 = 0;
		Sint32 c1 = 0;
		Sint32 c2 = 0;
		Sint32 errBelow0 = 0;
		Sint32 errBelow1 = 0;
		Sint32 errBelow2 = 0;
		Sint32 errPrev0 = 0;
		Sint32 errPrev1 = 0;
		Sint32 errPrev2 = 0;
		int  col = width;

		if (dirC < 0) {
			/* right-to-left row: start at the last pixel and at the
			 * last entry of the error buffer */
			pIn    += C_WIDTH * (width - 1);
			pOut   += C_WIDTH * (width - 1);
			pError += 3 * (width + 1);
		}

		while (col--) {
			/* Add the error stored for this column by the previous
			 * row, which lives one error entry ahead.  The error
			 * buffer has 3 values per column, so it is indexed with
			 * dirE and not with the pixel byte stride dirC. */
			c0 = (c0 + pError[dirE + 0] + 8) >> 4;
			c1 = (c1 + pError[dirE + 1] + 8) >> 4;
			c2 = (c2 + pError[dirE + 2] + 8) >> 4;
			c0 = errorClampTab[c0 + 256];
			c1 = errorClampTab[c1 + 256];
			c2 = errorClampTab[c2 + 256];
			c0 += pIn[C0_INDEX];
			c1 += pIn[C1_INDEX];
			c2 += pIn[C2_INDEX];
			c0 = clampTab[c0 + 256];
			c1 = clampTab[c1 + 256];
			c2 = clampTab[c2 + 256];

			/* store the quantized value, keep the remainder as error */
			c0 -= (pOut[C0_INDEX] = (c0 & C_MASK(C0_BITS)));
			c1 -= (pOut[C1_INDEX] = (c1 & C_MASK(C1_BITS)));
			c2 -= (pOut[C2_INDEX] = (c2 & C_MASK(C2_BITS)));

			{
				Sint32 errNext = c0;         /* Process component 0 */
				Sint32 delta = c0 * 2;
				c0 += delta;                 /* form error * 3 */
				pError[0] = errPrev0 + c0;
				c0 += delta;                 /* form error * 5 */
				errPrev0 = errBelow0 + c0;
				errBelow0 = errNext;
				c0 += delta;                 /* form error * 7 */
			}

			{
				Sint32 errNext = c1;         /* Process component 1 */
				Sint32 delta = c1 * 2;
				c1 += delta;                 /* form error * 3 */
				pError[1] = errPrev1 + c1;
				c1 += delta;                 /* form error * 5 */
				errPrev1 = errBelow1 + c1;
				errBelow1 = errNext;
				c1 += delta;                 /* form error * 7 */
			}

			{
				Sint32 errNext = c2;         /* Process component 2 */
				Sint32 delta = c2 * 2;
				c2 += delta;                 /* form error * 3 */
				pError[2] = errPrev2 + c2;
				c2 += delta;                 /* form error * 5 */
				errPrev2 = errBelow2 + c2;
				errBelow2 = errNext;
				c2 += delta;                 /* form error * 7 */
			}

			pIn    += dirC;
			pOut   += dirC;
			pError += dirE;
		}

		/* flush the pending sum for the last processed column */
		pError[0] = errPrev0;
		pError[1] = errPrev1;
		pError[2] = errPrev2;

		dirC = -dirC;
		dirE = -dirE;
	}

	free (floydSteinbergError);
}


/* Truncate masks off the low bits of the three colour components of
 * every pixel, producing image data that simulates the truncation done
 * by the display.
 *
 * pInData / pOutData use C_WIDTH bytes per pixel and rows that are
 * DATA_ROW_BYTE_WIDTH(width) bytes apart.  Only the bytes at C0_INDEX,
 * C1_INDEX and C2_INDEX of each pixel are read and written.
 */

void Truncate (Uint8* pInData, Uint8* pOutData, int width, int height) {
	const int stride = DATA_ROW_BYTE_WIDTH (width);
	int y;
	int x;

	/* rows from last to first, pixels from first to last in each row */
	for (y = height; y-- != 0; ) {
		const Uint8 *src = pInData  + stride * y;
		Uint8       *dst = pOutData + stride * y;

		for (x = width; x-- != 0; src += C_WIDTH, dst += C_WIDTH) {
			dst[C0_INDEX] = src[C0_INDEX] & C_MASK(C0_BITS);
			dst[C1_INDEX] = src[C1_INDEX] & C_MASK(C1_BITS);
			dst[C2_INDEX] = src[C2_INDEX] & C_MASK(C2_BITS);
		}
	}
}


/* Lua binding: quantizes the pixels of a surface in place, either with
 * Dither or with Truncate, and prints the elapsed time.
 *
 * Raises a Lua error when the first argument is not a usable surface or
 * when the surface does not have the layout the quantizers work on
 * (C_WIDTH bytes per pixel, rows DATA_ROW_BYTE_WIDTH(w) bytes apart).
 * Returns no values to Lua.
 */
int jiveL_quantize565 (lua_State *L) {

	/* stack is:
		 1: surface
		 2: dodither - boolean to enable/disable dithering
	*/

	JiveSurface *srf = tolua_tousertype(L, 1, 0);
	int dodither = lua_toboolean(L, 2);
	SDL_Surface *sdl;
	Uint8 *start;
	Uint32 t0, t1;
	int w, h;

	if (!srf || !srf->sdl || !srf->sdl->pixels) {
		return luaL_error(L, "quantize565: surface expected");
	}
	sdl = srf->sdl;

	/* Dither and Truncate derive the row stride from the width and
	 * touch bytes 1..3 of every 4-byte pixel; any other layout would
	 * make them read and write outside the pixel buffer. */
	if (sdl->format->BytesPerPixel != C_WIDTH
	    || sdl->pitch != DATA_ROW_BYTE_WIDTH(sdl->w)) {
		return luaL_error(L, "quantize565: unsupported surface format");
	}

	/* NOTE(review): pixels are accessed without SDL_LockSurface -
	 * confirm the surfaces passed here never require locking. */
	w = sdl->w;
	h = sdl->h;
	start = sdl->pixels;

	t0 = SDL_GetTicks();

	if (!dodither) {
		Truncate(start, start, w, h);
	} else {
		Dither(start, start, w, h);
	}

	t1 = SDL_GetTicks();

	/* the tick difference is unsigned, so print it with %u */
	printf("quantize565 took %ums\n", (unsigned int)(t1 - t0));

	return 0;
}


/* Functions registered by luaopen_jive_slim_dither; the { NULL, NULL }
 * entry terminates the list. */
static const struct luaL_Reg ditherlib[] = {
	{ "quantize565", jiveL_quantize565 },
	{ NULL,          NULL              }
};


/* Module entry point: registers the functions listed in ditherlib
 * under the name "dither" and reports one result to Lua. */
int luaopen_jive_slim_dither (lua_State *L) {
	const int num_results = 1;

	luaL_register(L, "dither", ditherlib);

	return num_results;
}
