/*
 * This file is part of the Advance project.
 *
 * Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004 Andrea Mazzoleni
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * In addition, as a special exception, Andrea Mazzoleni
 * gives permission to link the code of this program with
 * the MAME library (or with modified versions of MAME that use the
 * same license as MAME), and distribute linked combinations including
 * the two.  You must obey the GNU General Public License in all
 * respects for all of the code used other than MAME.  If you modify
 * this file, you may extend this exception to your version of the
 * file, but you are not obligated to do so.  If you do not wish to
 * do so, delete this exception statement from your version.
 */

#ifndef __IMEAN_H
#define __IMEAN_H

#include "icommon.h"

/***************************************************************************/
/* internal_mean */

/**
 * Indexes of the entries of the mean_mask table.
 * The two H entries are set to the same value by internal_mean_set().
 * MEAN_MASK_MAX isn't an entry, it's the number of entries of the table.
 */
enum MEAN_MASK {
	MEAN_MASK_H_0, /* first 32 bit word of the mask */
	MEAN_MASK_H_1, /* second 32 bit word of the mask */
	MEAN_MASK_MAX
};

/**
 * Mask used to compute the mean of packed pixels.
 * It's filled by internal_mean_set(). The C code reads only the first
 * entry, the MMX code loads the table as a single 8 bytes value.
 */
static uint32 mean_mask[MEAN_MASK_MAX];

/**
 * Replicate a pixel value to fill a 32 bit word.
 * With 1 byte per pixel the value is repeated 4 times, with 2 bytes
 * per pixel it's repeated 2 times, with 4 bytes per pixel it's returned
 * unchanged. Any other size returns 0.
 */
static uint32 expand_nibble(unsigned bytes_per_pixel, unsigned v)
{
	if (bytes_per_pixel == 4)
		return v;

	if (bytes_per_pixel == 2)
		return v | v << 16;

	if (bytes_per_pixel == 1) {
		/* duplicate to 16 bits and then to 32 bits */
		unsigned pair = v | v << 8;
		return pair | pair << 16;
	}

	return 0;
}

static void internal_mean_set(const struct video_pipeline_target_struct* target)
{
	adv_pixel rgb_h = rgb_highmask_make_from_def(target->color_def);
	adv_pixel rgb_m = rgb_wholemask_make_from_def(target->color_def);

	unsigned bytes_per_pixel = target->bytes_per_pixel;

	mean_mask[MEAN_MASK_H_0] = expand_nibble(bytes_per_pixel, (~rgb_h) & rgb_m);
	mean_mask[MEAN_MASK_H_1] = mean_mask[MEAN_MASK_H_0];
}

/**
 * Compute the mean of two words of packed rgb pixels.
 * This function computes (a + b) / 2 for every rgb nibble, using the
 * formula (a + b) / 2 = ((a ^ b) >> 1) + (a & b).
 * To extend this formula to a series of packed nibbles it's
 * implemented as (((v0 ^ v1) >> 1) & MASK) + (v0 & v1), where MASK
 * is used to clear the high bit of all the packed nibbles.
 * The mask is read from mean_mask, which must be already initialized
 * with internal_mean_set().
 */
static inline uint32 internal_mean_value(uint32 v0, uint32 v1)
{
	/* half of the bits that differ, without the bits shifted between nibbles */
	uint32 half_diff = ((v0 ^ v1) >> 1) & mean_mask[MEAN_MASK_H_0];
	/* bits set in both the values */
	uint32 common = v0 & v1;

	return half_diff + common;
}

/***************************************************************************/
/* Compute the mean of dst and src and store the result in dst */

#if defined(USE_ASM_INLINE)
/**
 * Compute the mean of dst and src and store the result in dst,
 * processing 8 bytes at time with the MMX registers.
 * The src and dst pointers must be aligned at 8 bytes.
 * The count argument is the number of 8 bytes blocks to process.
 * If count is 0 nothing is done.
 */
static inline void internal_mean64_vert_self_mmx(void* dst, const void* src, unsigned count)
{
	assert_align(((unsigned)src & 0x7)==0 && ((unsigned)dst & 0x7)==0);

	/* the loop decrements the counter before testing it, */
	/* with a 0 count it would wrap around instead of ending */
	if (!count)
		return;

	__asm__ __volatile__(
		"movq (%3), %%mm4\n" /* 8 bytes mask, both the entries of mean_mask */
		ASM_JUMP_ALIGN
		"0:\n"
		"movq (%0), %%mm0\n"
		"movq (%1), %%mm1\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm3\n"
		"pxor %%mm1, %%mm0\n" /* src ^ dst */
		"pand %%mm3, %%mm2\n" /* src & dst */
		"psrlq $1, %%mm0\n"
		"pand %%mm4, %%mm0\n" /* ((src ^ dst) >> 1) & mask */
		"paddd %%mm2, %%mm0\n"
		"movq %%mm0, (%1)\n"

		"addl $8, %0\n"
		"addl $8, %1\n"
		"decl %2\n"
		"jnz 0b\n"
		: "+S" (src), "+D" (dst), "+c" (count)
		: "r" (mean_mask)
		/* the asm reads and writes memory not listed in the operands */
		: "cc", "memory"
	);
}

/**
 * Variant of internal_mean64_vert_self_mmx() for uint8 elements.
 * The count argument is converted to the number of complete 8 bytes
 * blocks, any remainder is not processed.
 */
static inline void internal_mean8_vert_self_mmx(uint8* dst, const uint8* src, unsigned count)
{
	unsigned blocks = count / 8;

	internal_mean64_vert_self_mmx(dst, src, blocks);
}

/**
 * Variant of internal_mean64_vert_self_mmx() for uint16 elements.
 * The count argument is converted to the number of complete 8 bytes
 * blocks, any remainder is not processed.
 */
static inline void internal_mean16_vert_self_mmx(uint16* dst, const uint16* src, unsigned count)
{
	unsigned blocks = count / 4;

	internal_mean64_vert_self_mmx(dst, src, blocks);
}

/**
 * Variant of internal_mean64_vert_self_mmx() for uint32 elements.
 * The count argument is converted to the number of complete 8 bytes
 * blocks, any remainder is not processed.
 */
static inline void internal_mean32_vert_self_mmx(uint32* dst, const uint32* src, unsigned count)
{
	unsigned blocks = count / 2;

	internal_mean64_vert_self_mmx(dst, src, blocks);
}
#endif

/**
 * Compute the mean of dst and src and store the result in dst,
 * processing one 32 bit word at time with internal_mean_value().
 * The count argument is the number of 32 bit words to process.
 */
static inline void internal_mean32_vert_self_def(uint32* dst32, const uint32* src32, unsigned count)
{
	unsigned i;

	for (i = 0; i < count; ++i)
		dst32[i] = internal_mean_value(dst32[i], src32[i]);
}

/**
 * Variant of internal_mean32_vert_self_def() for uint8 elements.
 * The buffers are accessed as 32 bit words. The count argument is
 * converted to the number of complete words, any remainder is not
 * processed.
 */
static inline void internal_mean8_vert_self_def(uint8* dst, const uint8* src, unsigned count)
{
	uint32* dst32 = (uint32*)dst;
	const uint32* src32 = (const uint32*)src;
	unsigned words = count / 4;

	internal_mean32_vert_self_def(dst32, src32, words);
}

/**
 * Variant of internal_mean32_vert_self_def() for uint16 elements.
 * The buffers are accessed as 32 bit words. The count argument is
 * converted to the number of complete words, any remainder is not
 * processed.
 */
static inline void internal_mean16_vert_self_def(uint16* dst, const uint16* src, unsigned count)
{
	uint32* dst32 = (uint32*)dst;
	const uint32* src32 = (const uint32*)src;
	unsigned words = count / 2;

	internal_mean32_vert_self_def(dst32, src32, words);
}

/**
 * Compute the mean of dst and src and store the result in dst,
 * one uint8 element at time.
 * The destination is contiguous, the source pointer is advanced
 * by step1 after every element.
 */
static inline void internal_mean8_vert_self_step(uint8* dst8, const uint8* src8, unsigned count, int step1)
{
	for (; count != 0; --count) {
		*dst8 = internal_mean_value(*dst8, *src8);
		src8 += step1;
		++dst8;
	}
}

/**
 * Compute the mean of dst and src and store the result in dst,
 * one uint16 element at time.
 * The destination is contiguous, the source pointer is advanced
 * with PADD() by step1 after every element.
 */
static inline void internal_mean16_vert_self_step(uint16* dst16, const uint16* src16, unsigned count, int step1)
{
	for (; count != 0; --count) {
		*dst16 = internal_mean_value(*dst16, *src16);
		PADD(src16, step1);
		++dst16;
	}
}

/**
 * Compute the mean of dst and src and store the result in dst,
 * one uint32 element at time.
 * The destination is contiguous, the source pointer is advanced
 * with PADD() by step1 after every element.
 */
static inline void internal_mean32_vert_self_step(uint32* dst32, const uint32* src32, unsigned count, int step1)
{
	for (; count != 0; --count) {
		*dst32 = internal_mean_value(*dst32, *src32);
		PADD(src32, step1);
		++dst32;
	}
}

/***************************************************************************/
/* Compute the mean of src and src+1 and store the result in dst */

#if defined(USE_ASM_INLINE)
/*
 * 8 bytes mask, stored as two 32 bit words, with only the highest byte set.
 * It's used in the tail of internal_mean8_horz_next_step1_mmx() to keep
 * the last byte of the block in place, so that it's paired with itself.
 */
static uint32 mean8_horz_step1_mask[2] = { 0x00000000, 0xFF000000 };

/**
 * Compute the mean of every uint8 element of src with the next one
 * and store the result in dst, processing 8 bytes at time with the
 * MMX registers.
 * The last element of the last block is paired with itself.
 * The src and dst pointers must be aligned at 8 bytes.
 * The count argument is converted to the number of complete 8 bytes
 * blocks, any remainder is not processed. If no complete block is
 * present nothing is done.
 */
static inline void internal_mean8_horz_next_step1_mmx(uint8* dst, const uint8* src, unsigned count)
{
	assert_align(((unsigned)src & 0x7)==0 && ((unsigned)dst & 0x7)==0);

	count /= 8;
	if (!count)
		return;

	__asm__ __volatile__(
		/* the mask and the first block are loaded before the test */
		/* on the counter because the tail at the label 1 needs them */
		/* also when only one block is present */
		"movq (%3), %%mm4\n"
		"movq (%0), %%mm7\n" /* previous value */
		"decl %2\n"
		"jz 1f\n"

		ASM_JUMP_ALIGN
		"0:\n"
		"movq %%mm7, %%mm0\n" /* current value */
		"movq 8(%0), %%mm1\n"
		"movq %%mm0, %%mm6\n"
		"movq %%mm1, %%mm7\n" /* next value */
		"psrlq $8, %%mm6\n"
		"psllq $56, %%mm1\n"
		"por %%mm6, %%mm1\n" /* masked value */

		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm3\n"
		"pxor %%mm1, %%mm0\n"
		"pand %%mm3, %%mm2\n"
		"psrlq $1, %%mm0\n"
		"pand %%mm4, %%mm0\n"
		"paddd %%mm2, %%mm0\n"
		"movq %%mm0, (%1)\n"

		"addl $8, %0\n"
		"addl $8, %1\n"
		"decl %2\n"
		"jnz 0b\n"

		"1:\n"

		/* last block, the last element is paired with itself */
		"movq %%mm7, %%mm0\n" /* current value */
		"movq %%mm7, %%mm1\n"
		"psrlq $8, %%mm7\n"
		"pand (%4), %%mm1\n"
		"por %%mm7, %%mm1\n" /* masked value */

		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm3\n"
		"pxor %%mm1, %%mm0\n"
		"pand %%mm3, %%mm2\n"
		"psrlq $1, %%mm0\n"
		"pand %%mm4, %%mm0\n"
		"paddd %%mm2, %%mm0\n"
		"movq %%mm0, (%1)\n"

		: "+S" (src), "+D" (dst), "+c" (count)
		: "r" (mean_mask), "r" (mean8_horz_step1_mask)
		/* the asm reads and writes memory not listed in the operands */
		: "cc", "memory"
	);
}

/*
 * 8 bytes mask, stored as two 32 bit words, with only the highest 16 bits set.
 * It's used in the tail of internal_mean16_horz_next_step2_mmx() to keep
 * the last 16 bit element of the block in place, so that it's paired
 * with itself.
 */
static uint32 mean16_horz_step2_mask[2] = { 0x00000000, 0xFFFF0000 };

/**
 * Compute the mean of every uint16 element of src with the next one
 * and store the result in dst, processing 8 bytes at time with the
 * MMX registers.
 * The last element of the last block is paired with itself.
 * The src and dst pointers must be aligned at 8 bytes.
 * The count argument is converted to the number of complete 8 bytes
 * blocks, any remainder is not processed. If no complete block is
 * present nothing is done.
 */
static inline void internal_mean16_horz_next_step2_mmx(uint16* dst, const uint16* src, unsigned count)
{
	assert_align(((unsigned)src & 0x7)==0 && ((unsigned)dst & 0x7)==0);

	count /= 4;
	if (!count)
		return;

	__asm__ __volatile__(
		/* the mask and the first block are loaded before the test */
		/* on the counter because the tail at the label 1 needs them */
		/* also when only one block is present */
		"movq (%3), %%mm4\n"
		"movq (%0), %%mm7\n" /* previous value */
		"decl %2\n"
		"jz 1f\n"

		ASM_JUMP_ALIGN
		"0:\n"
		"movq %%mm7, %%mm0\n" /* current value */
		"movq 8(%0), %%mm1\n"
		"movq %%mm0, %%mm6\n"
		"movq %%mm1, %%mm7\n" /* next value */
		"psrlq $16, %%mm6\n"
		"psllq $48, %%mm1\n"
		"por %%mm6, %%mm1\n" /* masked value */

		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm3\n"
		"pxor %%mm1, %%mm0\n"
		"pand %%mm3, %%mm2\n"
		"psrlq $1, %%mm0\n"
		"pand %%mm4, %%mm0\n"
		"paddd %%mm2, %%mm0\n"
		"movq %%mm0, (%1)\n"
		"addl $8, %0\n"
		"addl $8, %1\n"
		"decl %2\n"
		"jnz 0b\n"

		"1:\n"

		/* last block, the last element is paired with itself */
		"movq %%mm7, %%mm0\n" /* current value */
		"movq %%mm7, %%mm1\n"
		"psrlq $16, %%mm7\n"
		"pand (%4), %%mm1\n"
		"por %%mm7, %%mm1\n" /* masked value */

		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm3\n"
		"pxor %%mm1, %%mm0\n"
		"pand %%mm3, %%mm2\n"
		"psrlq $1, %%mm0\n"
		"pand %%mm4, %%mm0\n"
		"paddd %%mm2, %%mm0\n"
		"movq %%mm0, (%1)\n"

		: "+S" (src), "+D" (dst), "+c" (count)
		: "r" (mean_mask), "r" (mean16_horz_step2_mask)
		/* the asm reads and writes memory not listed in the operands */
		: "cc", "memory"
	);
}

/*
 * 8 bytes mask, stored as two 32 bit words, with only the highest 32 bits set.
 * It's used in the tail of internal_mean32_horz_next_step4_mmx() to keep
 * the last 32 bit element of the block in place, so that it's paired
 * with itself.
 */
static uint32 mean32_horz_step4_mask[2] = { 0x00000000, 0xFFFFFFFF };

/**
 * Compute the mean of every uint32 element of src with the next one
 * and store the result in dst, processing 8 bytes at time with the
 * MMX registers.
 * The last element of the last block is paired with itself.
 * The src and dst pointers must be aligned at 8 bytes.
 * The count argument is converted to the number of complete 8 bytes
 * blocks, any remainder is not processed. If no complete block is
 * present nothing is done.
 */
static inline void internal_mean32_horz_next_step4_mmx(uint32* dst, const uint32* src, unsigned count)
{
	assert_align(((unsigned)src & 0x7)==0 && ((unsigned)dst & 0x7)==0);

	count /= 2;
	if (!count)
		return;

	__asm__ __volatile__(
		/* the mask and the first block are loaded before the test */
		/* on the counter because the tail at the label 1 needs them */
		/* also when only one block is present */
		"movq (%3), %%mm4\n"
		"movq (%0), %%mm7\n" /* previous value */
		"decl %2\n"
		"jz 1f\n"

		ASM_JUMP_ALIGN
		"0:\n"
		"movq %%mm7, %%mm0\n" /* current value */
		"movq 8(%0), %%mm1\n"
		"movq %%mm0, %%mm6\n"
		"movq %%mm1, %%mm7\n" /* next value */
		"psrlq $32, %%mm6\n"
		"psllq $32, %%mm1\n"
		"por %%mm6, %%mm1\n" /* masked value */

		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm3\n"
		"pxor %%mm1, %%mm0\n"
		"pand %%mm3, %%mm2\n"
		"psrlq $1, %%mm0\n"
		"pand %%mm4, %%mm0\n"
		"paddd %%mm2, %%mm0\n"
		"movq %%mm0, (%1)\n"

		"addl $8, %0\n"
		"addl $8, %1\n"
		"decl %2\n"
		"jnz 0b\n"

		"1:\n"

		/* last block, the last element is paired with itself */
		"movq %%mm7, %%mm0\n" /* current value */
		"movq %%mm7, %%mm1\n"
		"psrlq $32, %%mm7\n"
		"pand (%4), %%mm1\n"
		"por %%mm7, %%mm1\n" /* masked value */

		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm3\n"
		"pxor %%mm1, %%mm0\n"
		"pand %%mm3, %%mm2\n"
		"psrlq $1, %%mm0\n"
		"pand %%mm4, %%mm0\n"
		"paddd %%mm2, %%mm0\n"
		"movq %%mm0, (%1)\n"

		: "+S" (src), "+D" (dst), "+c" (count)
		: "r" (mean_mask), "r" (mean32_horz_step4_mask)
		/* the asm reads and writes memory not listed in the operands */
		: "cc", "memory"
	);
}
#endif

/**
 * Compute the mean of every uint8 element of src8 with the next one
 * and store the result in dst8, processing one 32 bit word at time.
 * The last element of the last word is paired with itself.
 * The count argument is converted to the number of complete 32 bit
 * words, any remainder is not processed. If no complete word is
 * present nothing is done.
 * The direction of the shifts depends on USE_LSB.
 */
static inline void internal_mean8_horz_next_step1_def(uint8* dst8, const uint8* src8, unsigned count)
{
	const uint32* in;
	uint32* out;
	unsigned words = count / 4;
	uint32 cur;
	uint32 shifted;

	if (!words)
		return;

	in = (const uint32*)src8;
	out = (uint32*)dst8;

	/* all the words with the exception of the last one */
	for (--words; words != 0; --words) {
		cur = in[0];
#ifdef USE_LSB
		shifted = (cur >> 8) | (in[1] << 24);
#else
		shifted = (cur << 8) | (in[1] >> 24);
#endif
		*out = internal_mean_value(cur, shifted);
		++out;
		++in;
	}

	/* last word, no next word is read */
	cur = in[0];
#ifdef USE_LSB
	shifted = (cur >> 8) | (cur & 0xFF000000);
#else
	shifted = (cur << 8) | (cur & 0x000000FF);
#endif
	*out = internal_mean_value(cur, shifted);
}

/**
 * Compute the mean of every uint16 element of src16 with the next one
 * and store the result in dst16, processing one 32 bit word at time.
 * The last element of the last word is paired with itself.
 * The count argument is converted to the number of complete 32 bit
 * words, any remainder is not processed. If no complete word is
 * present nothing is done.
 * The direction of the shifts depends on USE_LSB.
 */
static inline void internal_mean16_horz_next_step2_def(uint16* dst16, const uint16* src16, unsigned count)
{
	const uint32* in;
	uint32* out;
	unsigned words = count / 2;
	uint32 cur;
	uint32 shifted;

	if (!words)
		return;

	in = (const uint32*)src16;
	out = (uint32*)dst16;

	/* all the words with the exception of the last one */
	for (--words; words != 0; --words) {
		cur = in[0];
#ifdef USE_LSB
		shifted = (cur >> 16) | (in[1] << 16);
#else
		shifted = (cur << 16) | (in[1] >> 16);
#endif
		*out = internal_mean_value(cur, shifted);
		++out;
		++in;
	}

	/* last word, no next word is read */
	cur = in[0];
#ifdef USE_LSB
	shifted = (cur >> 16) | (cur & 0xFFFF0000);
#else
	shifted = (cur << 16) | (cur & 0x0000FFFF);
#endif
	*out = internal_mean_value(cur, shifted);
}

/**
 * Compute the mean of every uint32 element of src32 with the next one
 * and store the result in dst32.
 * The last element has no next one and it's copied unchanged.
 * The count argument is the number of elements. If count is 0 nothing
 * is done.
 */
static inline void internal_mean32_horz_next_step4_def(uint32* dst32, const uint32* src32, unsigned count)
{
	unsigned last;
	unsigned i;

	if (!count)
		return;

	last = count - 1;

	for (i = 0; i < last; ++i)
		dst32[i] = internal_mean_value(src32[i], src32[i + 1]);

	dst32[last] = src32[last];
}

/**
 * Compute the mean of every uint8 element of src8 with the one read
 * with P8DER() at the offset step, and store the result in dst8.
 * The destination is contiguous, the source pointer is advanced
 * with PADD() by step after every element.
 * The last element is copied unchanged. If count is 0 nothing is done.
 */
static inline void internal_mean8_horz_next_step(uint8* dst8, const uint8* src8, unsigned count, int step)
{
	if (!count)
		return;

	for (--count; count != 0; --count) {
		*dst8 = internal_mean_value(P8DER0(src8), P8DER(src8, step));
		++dst8;
		PADD(src8, step);
	}

	*dst8 = *src8;
}

/**
 * Compute the mean of every uint16 element of src16 with the one read
 * with P16DER() at the offset step, and store the result in dst16.
 * The destination is contiguous, the source pointer is advanced
 * with PADD() by step after every element.
 * The last element is copied unchanged. If count is 0 nothing is done.
 */
static inline void internal_mean16_horz_next_step(uint16* dst16, const uint16* src16, unsigned count, int step)
{
	if (!count)
		return;

	for (--count; count != 0; --count) {
		*dst16 = internal_mean_value(P16DER0(src16), P16DER(src16, step));
		++dst16;
		PADD(src16, step);
	}

	*dst16 = *src16;
}

/**
 * Compute the mean of every uint32 element of src32 with the one read
 * with P32DER() at the offset step, and store the result in dst32.
 * The destination is contiguous, the source pointer is advanced
 * with PADD() by step after every element.
 * The last element is copied unchanged. If count is 0 nothing is done.
 */
static inline void internal_mean32_horz_next_step(uint32* dst32, const uint32* src32, unsigned count, int step)
{
	if (!count)
		return;

	for (--count; count != 0; --count) {
		*dst32 = internal_mean_value(P32DER0(src32), P32DER(src32, step));
		++dst32;
		PADD(src32, step);
	}

	*dst32 = *src32;
}

#endif