/******************************************************************************
    Copyright (C) 2014 by Hugh Bailey <obs.jim@gmail.com>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/

//#define DEBUGGING

/* View-projection matrix applied to vertex positions in VSDefault. */
uniform float4x4  ViewProj;

/* Plane boundaries that the packing shaders (PSNV12, PSPlanar420,
 * PSPlanar444) compare their computed byte offsets against. */
uniform float     u_plane_offset;
uniform float     v_plane_offset;

/* Dimension parameters supplied by the host.  Judging by how the shaders
 * use them, the "_i" suffix appears to denote a reciprocal and "_d2" a
 * halved value -- TODO confirm against the code that sets these uniforms. */
uniform float     width;
uniform float     height;
uniform float     width_i;
uniform float     height_i;
uniform float     width_d2;
uniform float     height_d2;
uniform float     width_d2_i;
uniform float     height_d2_i;
uniform float     input_width;
uniform float     input_height;
uniform float     input_width_i;
uniform float     input_height_i;
uniform float     input_width_i_d2;
uniform float     input_height_i_d2;

/* Integer values used for texel addressing by GetIntOffsetColor and the
 * planar/NV12 *_Reverse shaders. */
uniform int       int_width;
uniform int       int_input_width;
uniform int       int_u_plane_offset;
uniform int       int_v_plane_offset;

/* Matrix and clamp range applied to YUV values in the *_Reverse shaders. */
uniform float4x4  color_matrix;
uniform float3    color_range_min = {0.0, 0.0, 0.0};
uniform float3    color_range_max = {1.0, 1.0, 1.0};

/* Source texture read by every pixel shader in this file. */
uniform texture2d image;

/* Sampler used by every image.Sample() call in this file: linear filtering
 * with both texture axes clamped. */
sampler_state def_sampler {
	Filter   = Linear;
	AddressU = Clamp;
	AddressV = Clamp;
};

/* Vertex layout used as vertex shader input and output and as pixel shader
 * input: a position plus one set of texture coordinates. */
struct VertInOut {
	float4 pos : POSITION;
	float2 uv  : TEXCOORD0;
};

/* Vertex shader shared by every technique: transforms the incoming position
 * by ViewProj (with w forced to 1.0) and passes the texture coordinate
 * through untouched. */
VertInOut VSDefault(VertInOut vert_in)
{
	float4 position = float4(vert_in.pos.xyz, 1.0);

	VertInOut result;
	result.uv  = vert_in.uv;
	result.pos = mul(position, ViewProj);
	return result;
}

/* small bias added to computed offsets and coordinates before they are
 * floored, truncated to int or passed to fmod; used to prevent internal GPU
 * precision issues with fmod in particular */
#define PRECISION_OFFSET 0.2

/*
 * Packing shader for the NV12 technique.
 *
 * A byte offset is derived from the output position (four bytes per output
 * pixel).  Offsets below u_plane_offset produce the .y channel of four
 * horizontally adjacent source texels; all other offsets produce the .rb
 * channels of two samples taken two texels apart on texel borders, so the
 * linear sampler averages the neighbouring texels.
 */
float4 PSNV12(VertInOut vert_in) : TARGET
{
	float src_row = floor(vert_in.uv.y * input_height);

	float byte_offset = floor((src_row + vert_in.uv.x) * width) * 4.0;
	byte_offset += PRECISION_OFFSET;

	if (byte_offset < u_plane_offset) {
#ifdef DEBUGGING
		return float4(1.0, 1.0, 1.0, 1.0);
#endif

		float luma_x = floor(fmod(byte_offset, width)) * width_i;
		float luma_y = floor(byte_offset * width_i)    * height_i;

		/* sample at texel centers so each tap reads exactly one
		 * source texel */
		luma_x += width_i  * 0.5;
		luma_y += height_i * 0.5;

		float2 pos0 = float2(luma_x, luma_y);
		luma_x += width_i;
		float2 pos1 = float2(luma_x, luma_y);
		luma_x += width_i;
		float2 pos2 = float2(luma_x, luma_y);
		float2 pos3 = float2(luma_x + width_i, luma_y);

		float4 tap0 = image.Sample(def_sampler, pos0);
		float4 tap1 = image.Sample(def_sampler, pos1);
		float4 tap2 = image.Sample(def_sampler, pos2);
		float4 tap3 = image.Sample(def_sampler, pos3);

		/* one .y value per tap, in tap order */
		return float4(tap0.y, tap1.y, tap2.y, tap3.y);
	}

#ifdef DEBUGGING
	return float4(0.5, 0.2, 0.5, 0.2);
#endif

	float chroma_offset = byte_offset - u_plane_offset;

	float chroma_x = floor(fmod(chroma_offset, width)) * width_i;
	float chroma_y = floor(chroma_offset * width_i)    * height_d2_i;
	float two_texels = width_i*2.0;

	/* sample on the borders of each set of 4 pixels so the linear
	 * sampler performs bilinear averaging */
	chroma_x += width_i;
	chroma_y += height_i;

	float2 first_pos  = float2(chroma_x,              chroma_y);
	float2 second_pos = float2(chroma_x + two_texels, chroma_y);

	float4 first_tap  = image.Sample(def_sampler, first_pos);
	float4 second_tap = image.Sample(def_sampler, second_pos);

	return float4(first_tap.rb, second_tap.rb);
}

/* Outputs the .y channel of the source texel at the interpolated texture
 * coordinate. */
float PSNV12_Y(VertInOut vert_in) : TARGET
{
	float4 texel = image.Sample(def_sampler, vert_in.uv.xy);
	return texel.y;
}

/* Outputs the .x and .z channels of the source texel at the interpolated
 * texture coordinate. */
float2 PSNV12_UV(VertInOut vert_in) : TARGET
{
	float4 texel = image.Sample(def_sampler, vert_in.uv.xy);
	return texel.xz;
}

/*
 * Packing shader for the Planar420 technique.
 *
 * A byte offset is derived from the output position (four bytes per output
 * pixel) and four sample positions are chosen from it:
 *   - offsets below u_plane_offset: four horizontally adjacent texel
 *     centers; the .y channel of each sample is returned.
 *   - offsets at or above u_plane_offset: four positions two texels apart on
 *     texel borders, so the linear sampler averages the neighbouring texels;
 *     the .x channel is returned for offsets below v_plane_offset and the .z
 *     channel otherwise.
 *
 * When a group of four chroma samples runs past the end of the current line,
 * the third and fourth samples are taken from the start of the next chroma
 * line instead.
 */
float4 PSPlanar420(VertInOut vert_in) : TARGET
{
	float v_mul = floor(vert_in.uv.y * input_height);

	float byte_offset = floor((v_mul + vert_in.uv.x) * width) * 4.0;
	byte_offset += PRECISION_OFFSET;

	float2 sample_pos[4];

	if (byte_offset < u_plane_offset) {
#ifdef DEBUGGING
		return float4(1.0, 1.0, 1.0, 1.0);
#endif

		float lum_u = floor(fmod(byte_offset, width)) * width_i;
		float lum_v = floor(byte_offset * width_i)    * height_i;

		/* move to texel centers to sample the 4 pixels properly */
		lum_u += width_i  * 0.5;
		lum_v += height_i * 0.5;

		sample_pos[0] = float2(lum_u,            lum_v);
		sample_pos[1] = float2(lum_u += width_i, lum_v);
		sample_pos[2] = float2(lum_u += width_i, lum_v);
		sample_pos[3] = float2(lum_u +  width_i, lum_v);

	} else {
#ifdef DEBUGGING
		return ((byte_offset < v_plane_offset) ?
				float4(0.5, 0.5, 0.5, 0.5) :
				float4(0.2, 0.2, 0.2, 0.2));
#endif

		/* make the offset relative to the start of whichever chroma
		 * plane it falls in */
		float new_offset = byte_offset -
				((byte_offset < v_plane_offset) ?
				u_plane_offset : v_plane_offset);

		float ch_u = floor(fmod(new_offset, width_d2)) * width_d2_i;
		float ch_v = floor(new_offset * width_d2_i)    * height_d2_i;
		float width_i2 = width_i*2.0;

		/* move to the borders of each set of 4 pixels to force it
		 * to do bilinear averaging */
		ch_u += width_i;
		ch_v += height_i;

		/* set up coordinates for next chroma line, in case
		 * (width / 2) % 4 == 2, i.e. the current set of 4 pixels is split
		 * between the current and the next chroma line; do note that the next
		 * chroma line is two source lines below the current source line */
		/* NOTE(review): the factor 3 is applied after ch_v has already
		 * been shifted by height_i above; verify that this matches the
		 * "two source lines below" intent. */
		float ch_u_n = 0.   + width_i;
		float ch_v_n = ch_v + height_i * 3;

		sample_pos[0] = float2(ch_u,             ch_v);
		sample_pos[1] = float2(ch_u += width_i2, ch_v);

		ch_u += width_i2;
		// check if ch_u overflowed the current source and chroma line
		if (ch_u > 1.0) {
			/* both remaining samples come from the next chroma line;
			 * each must land in its own slot so that
			 * sample_pos[3] is always initialized */
			sample_pos[2] = float2(ch_u_n,            ch_v_n);
			sample_pos[3] = float2(ch_u_n + width_i2, ch_v_n);
		} else {
			sample_pos[2] = float2(ch_u,             ch_v);
			sample_pos[3] = float2(ch_u +  width_i2, ch_v);
		}
	}

	float4x4 out_val = float4x4(
		image.Sample(def_sampler, sample_pos[0]),
		image.Sample(def_sampler, sample_pos[1]),
		image.Sample(def_sampler, sample_pos[2]),
		image.Sample(def_sampler, sample_pos[3])
	);

	/* after the transpose, row N holds channel N of all four samples */
	out_val = transpose(out_val);

	if (byte_offset < u_plane_offset)
		return out_val[1];
	else if (byte_offset < v_plane_offset)
		return out_val[0];
	else
		return out_val[2];
}

/*
 * Packing shader for the Planar444 technique.
 *
 * A byte offset is derived from the output position (four bytes per output
 * pixel) and rebased to the start of the plane it falls in.  Four
 * horizontally adjacent texel centers are then sampled and one channel of
 * each sample is returned: .y for offsets below u_plane_offset, .x for
 * offsets below v_plane_offset and .z otherwise.
 */
float4 PSPlanar444(VertInOut vert_in) : TARGET
{
	float src_row = floor(vert_in.uv.y * input_height);

	float byte_offset = floor((src_row + vert_in.uv.x) * width) * 4.0;
	byte_offset += PRECISION_OFFSET;

	/* pick the start of the plane that contains this offset */
	float plane_start = 0.0;
	if (byte_offset >= v_plane_offset)
		plane_start = v_plane_offset;
	else if (byte_offset >= u_plane_offset)
		plane_start = u_plane_offset;

	float plane_offset = byte_offset - plane_start;

	float tap_x = floor(fmod(plane_offset, width)) * width_i;
	float tap_y = floor(plane_offset * width_i)    * height_i;

	/* sample at texel centers so each tap reads exactly one source
	 * texel */
	tap_x += width_i  * 0.5;
	tap_y += height_i * 0.5;

	float2 tap_pos[4];
	tap_pos[0] = float2(tap_x, tap_y);
	tap_x += width_i;
	tap_pos[1] = float2(tap_x, tap_y);
	tap_x += width_i;
	tap_pos[2] = float2(tap_x, tap_y);
	tap_pos[3] = float2(tap_x + width_i, tap_y);

	/* after the transpose, row N holds channel N of all four taps */
	float4x4 channels = transpose(float4x4(
		image.Sample(def_sampler, tap_pos[0]),
		image.Sample(def_sampler, tap_pos[1]),
		image.Sample(def_sampler, tap_pos[2]),
		image.Sample(def_sampler, tap_pos[3])
	));

	if (byte_offset < u_plane_offset)
		return channels[1];
	if (byte_offset < v_plane_offset)
		return channels[0];
	return channels[2];
}

/* Treats `offset` as a linear index into a texture that is int_input_width
 * texels wide and returns the .r channel of the texel at that index. */
float GetIntOffsetColor(int offset)
{
	int texel_x = offset % int_input_width;
	int texel_y = offset / int_input_width;

	return image.Load(int3(texel_x, texel_y, 0)).r;
}

/*
 * Unpacking shader shared by the UYVY/YUY2/YVYU *_Reverse techniques.
 *
 * Each source texel carries two luma values plus one U and one V value;
 * u_pos, v_pos, y0_pos and y1_pos are the channel indices of those values
 * within the texel.  Even output columns use y0_pos, odd columns use y1_pos.
 * The resulting YUV triple is clamped to the configured range, multiplied by
 * color_matrix and saturated.
 */
float4 PSPacked422_Reverse(VertInOut vert_in, int u_pos, int v_pos,
		int y0_pos, int y1_pos) : TARGET
{
	/* 1.0 for odd output columns, 0.0 for even ones */
	float is_odd = floor(fmod(width * vert_in.uv.x + PRECISION_OFFSET,
			2.0));

	float pair_x = floor(width_d2 * vert_in.uv.x + PRECISION_OFFSET) *
			width_d2_i;
	pair_x += input_width_i_d2;

	float4 texel = image.Sample(def_sampler,
			float2(pair_x, vert_in.uv.y));

	float luma;
	if (is_odd > 0.5)
		luma = texel[y1_pos];
	else
		luma = texel[y0_pos];

	float3 yuv = float3(luma, texel[u_pos], texel[v_pos]);
	yuv = clamp(yuv, color_range_min, color_range_max);

	float4 converted = mul(float4(yuv, 1.0), color_matrix);
	return saturate(converted);
}

/*
 * Unpacking shader for the I420_Reverse technique.
 *
 * Reads one value at the full-resolution linear offset and one value from
 * each of the two chroma planes (which are indexed at half resolution in
 * both directions), clamps the triple to the configured range, multiplies it
 * by color_matrix and saturates the result.
 */
float4 PSPlanar420_Reverse(VertInOut vert_in) : TARGET
{
	int x = int(vert_in.uv.x * width  + PRECISION_OFFSET);
	int y = int(vert_in.uv.y * height + PRECISION_OFFSET);

	int chroma_row    = y / 2;
	int chroma_stride = int_width / 2;
	int chroma_index  = chroma_row * chroma_stride + x / 2;

	float y_val = GetIntOffsetColor(y * int_width + x);
	float u_val = GetIntOffsetColor(int_u_plane_offset + chroma_index);
	float v_val = GetIntOffsetColor(int_v_plane_offset + chroma_index);

	float3 yuv = clamp(float3(y_val, u_val, v_val),
			color_range_min, color_range_max);

	float4 converted = mul(float4(yuv, 1.0), color_matrix);
	return saturate(converted);
}

/*
 * Unpacking shader for the I444_Reverse technique.
 *
 * All three planes are indexed with the same full-resolution linear offset;
 * the two chroma planes start at int_u_plane_offset and int_v_plane_offset.
 * The triple is clamped to the configured range, multiplied by color_matrix
 * and saturated.
 */
float4 PSPlanar444_Reverse(VertInOut vert_in) : TARGET
{
	int x = int(vert_in.uv.x * width  + PRECISION_OFFSET);
	int y = int(vert_in.uv.y * height + PRECISION_OFFSET);

	/* same index within every plane */
	int plane_index = y * int_width + x;

	float y_val = GetIntOffsetColor(plane_index);
	float u_val = GetIntOffsetColor(int_u_plane_offset + plane_index);
	float v_val = GetIntOffsetColor(int_v_plane_offset + plane_index);

	float3 yuv = clamp(float3(y_val, u_val, v_val),
			color_range_min, color_range_max);

	float4 converted = mul(float4(yuv, 1.0), color_matrix);
	return saturate(converted);
}

/*
 * Unpacking shader for the NV12_Reverse technique.
 *
 * Reads one value at the full-resolution linear offset and two consecutive
 * values from the plane starting at int_u_plane_offset (indexed at half
 * resolution in both directions, two values per position).  The triple is
 * clamped to the configured range, multiplied by color_matrix and saturated.
 */
float4 PSNV12_Reverse(VertInOut vert_in) : TARGET
{
	int x = int(vert_in.uv.x * width  + PRECISION_OFFSET);
	int y = int(vert_in.uv.y * height + PRECISION_OFFSET);

	int chroma_row    = y / 2;
	int chroma_stride = int_width / 2;
	int chroma_index  = chroma_row * chroma_stride + x / 2;

	/* two interleaved values per chroma position */
	int first_chroma = int_u_plane_offset + chroma_index * 2;

	float y_val = GetIntOffsetColor(y * int_width + x);
	float u_val = GetIntOffsetColor(first_chroma);
	float v_val = GetIntOffsetColor(first_chroma + 1);

	float3 yuv = clamp(float3(y_val, u_val, v_val),
			color_range_min, color_range_max);

	float4 converted = mul(float4(yuv, 1.0), color_matrix);
	return saturate(converted);
}

/* Loads the .x channel of the addressed texel, expands it with
 * (v - 16/255) * 255/219, saturates it and replicates the result to an
 * opaque gray RGBA value. */
float4 PSY800_Limited(VertInOut vert_in) : TARGET
{
	int x = int(vert_in.uv.x * width  + PRECISION_OFFSET);
	int y = int(vert_in.uv.y * height + PRECISION_OFFSET);

	float4 texel = image.Load(int3(x, y, 0));

	float gray = saturate((texel.x - (16.0 / 255.0)) * (255.0 / 219.0));
	return float4(gray, gray, gray, 1.0);
}

/* Loads the .x channel of the addressed texel and replicates it, unchanged,
 * to an opaque gray RGBA value. */
float4 PSY800_Full(VertInOut vert_in) : TARGET
{
	int x = int(vert_in.uv.x * width  + PRECISION_OFFSET);
	int y = int(vert_in.uv.y * height + PRECISION_OFFSET);

	float gray = image.Load(int3(x, y, 0)).x;
	return float4(gray, gray, gray, 1.0);
}

/* Loads the addressed texel, expands its RGB channels with
 * (v - 16/255) * 255/219 and saturates them; alpha is passed through
 * unchanged. */
float4 PSRGB_Limited(VertInOut vert_in) : TARGET
{
	int x = int(vert_in.uv.x * width  + PRECISION_OFFSET);
	int y = int(vert_in.uv.y * height + PRECISION_OFFSET);

	float4 texel = image.Load(int3(x, y, 0));

	float3 expanded = saturate(
			(texel.rgb - (16.0 / 255.0)) * (255.0 / 219.0));
	return float4(expanded, texel.a);
}

/* Single pass: VSDefault vertex stage, PSPlanar420 pixel stage. */
technique Planar420
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSPlanar420(vert_in);
	}
}

/* Single pass: VSDefault vertex stage, PSPlanar444 pixel stage. */
technique Planar444
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSPlanar444(vert_in);
	}
}

/* Single pass: VSDefault vertex stage, PSNV12 pixel stage. */
technique NV12
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSNV12(vert_in);
	}
}

/* Single pass: VSDefault vertex stage, PSNV12_Y pixel stage (outputs only
 * the .y channel of the source). */
technique NV12_Y
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSNV12_Y(vert_in);
	}
}

/* Single pass: VSDefault vertex stage, PSNV12_UV pixel stage (outputs only
 * the .x and .z channels of the source). */
technique NV12_UV
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSNV12_UV(vert_in);
	}
}

/* Single pass: VSDefault vertex stage, PSPacked422_Reverse pixel stage with
 * texel channel indices u = 2, v = 0, y0 = 1, y1 = 3. */
technique UYVY_Reverse
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSPacked422_Reverse(vert_in, 2, 0, 1, 3);
	}
}

/* Single pass: VSDefault vertex stage, PSPacked422_Reverse pixel stage with
 * texel channel indices u = 1, v = 3, y0 = 2, y1 = 0. */
technique YUY2_Reverse
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSPacked422_Reverse(vert_in, 1, 3, 2, 0);
	}
}

/* Single pass: VSDefault vertex stage, PSPacked422_Reverse pixel stage with
 * texel channel indices u = 3, v = 1, y0 = 2, y1 = 0. */
technique YVYU_Reverse
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSPacked422_Reverse(vert_in, 3, 1, 2, 0);
	}
}

/* Single pass: VSDefault vertex stage, PSPlanar420_Reverse pixel stage. */
technique I420_Reverse
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSPlanar420_Reverse(vert_in);
	}
}

/* Single pass: VSDefault vertex stage, PSPlanar444_Reverse pixel stage. */
technique I444_Reverse
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSPlanar444_Reverse(vert_in);
	}
}

/* Single pass: VSDefault vertex stage, PSNV12_Reverse pixel stage. */
technique NV12_Reverse
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSNV12_Reverse(vert_in);
	}
}

/* Single pass: VSDefault vertex stage, PSY800_Limited pixel stage. */
technique Y800_Limited
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSY800_Limited(vert_in);
	}
}

/* Single pass: VSDefault vertex stage, PSY800_Full pixel stage. */
technique Y800_Full
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSY800_Full(vert_in);
	}
}

/* Single pass: VSDefault vertex stage, PSRGB_Limited pixel stage. */
technique RGB_Limited
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSRGB_Limited(vert_in);
	}
}