mirror of
https://github.com/obsproject/obs-studio.git
synced 2024-09-20 13:08:50 +02:00
aa22b61e3e
The cache coherency of rasterization for full-screen passes is better using an oversized triangle that is clipped rather than two triangles. Traversal order of rasterization is GPU-specific, but will almost certainly be better using an undivided primitive. A smaller benefit is that quads along the diagonal are not evaluated multiple times, but that's minor in comparison. Redo format shaders to bypass vertex buffer, and input layout. Add global shader bool "obs_glsl_compile" to make API-specific decisions, i.e. handle upside-down UVs. gl_ortho is not needed for format conversion because the vertex shader does not use ViewProj anymore. This can be applied to more situations, but start small first. Testbed full screen passes, Intel HD Graphics 530: RGBA -> UYVX: 467 -> 439 us, ~6% savings UYVX -> uv: 295 -> 239 us, ~19% savings
556 lines
14 KiB
Plaintext
556 lines
14 KiB
Plaintext
/******************************************************************************
|
|
Copyright (C) 2014 by Hugh Bailey <obs.jim@gmail.com>
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
******************************************************************************/
|
|
|
|
//#define DEBUGGING
|
|
|
|
/* Plane boundaries: the packing shaders compare their computed byte offset
 * against these to decide which plane the current output texel belongs to. */
uniform float u_plane_offset;
uniform float v_plane_offset;

/* Dimension parameters supplied by the host.  From the way they are used in
 * this file, the "_i" suffix presumably denotes a reciprocal and "_d2" a
 * halved value -- TODO confirm against the code that sets them. */
uniform float width;
uniform float height;
uniform float width_i;
uniform float height_i;
uniform float width_d2;
uniform float height_d2;
uniform float width_d2_i;
uniform float height_d2_i;
uniform float input_width;
uniform float input_height;
uniform float input_width_i;
uniform float input_height_i;
uniform float input_width_i_d2;
uniform float input_height_i_d2;

/* Integer parameters used for integer texel addressing by the *_Reverse
 * shaders and GetIntOffsetColor. */
uniform int int_width;
uniform int int_input_width;
uniform int int_u_plane_offset;
uniform int int_v_plane_offset;

/* Matrix applied to (y, u, v, 1) in the *_Reverse shaders, and the range the
 * yuv triple is clamped to beforehand. */
uniform float4x4 color_matrix;
uniform float3 color_range_min = {0.0, 0.0, 0.0};
uniform float3 color_range_max = {1.0, 1.0, 1.0};

/* Source texture read by every pixel shader in this file. */
uniform texture2d image;
|
|
|
|
/* Sampler used by every image.Sample() call in this file: linear filtering
 * with both texture axes clamped. */
sampler_state def_sampler {
	Filter   = Linear;
	AddressU = Clamp;
	AddressV = Clamp;
};
|
|
|
|
/* Output of VSDefault and input of every pixel shader: a clip-space position
 * and a single texture coordinate. */
struct VertInOut {
	float4 pos : POSITION;
	float2 uv  : TEXCOORD0;
};
|
|
|
|
/* Generates a vertex purely from its index, without a vertex buffer.
 *
 * For ids 0, 1, 2 the positions are (-1,-1), (-1,3) and (3,-1), i.e. a single
 * oversized triangle that covers the whole [-1,1] clip-space square.
 * U runs 0..2 along x; V runs 0..2 along y when obs_glsl_compile is set and
 * 1..-1 otherwise, so the V direction is flipped between the two cases. */
VertInOut VSDefault(uint id : VERTEXID)
{
	float hi = float(id >> 1);
	float lo = float(id & uint(1));

	/* UV contribution of the low bit before the API-specific flip. */
	float lo_uv = lo * 2.0;

	VertInOut result;
	result.pos = float4(hi * 4.0 - 1.0, lo * 4.0 - 1.0, 0.0, 1.0);
	result.uv = float2(hi * 2.0,
			obs_glsl_compile ? lo_uv : (1.0 - lo_uv));
	return result;
}
|
|
|
|
/* Small bias added before floor()/fmod()/int() conversions to guard against
 * internal GPU precision issues, with fmod in particular. */
#define PRECISION_OFFSET 0.2
|
|
|
|
/* Packs the source image into a single NV12-style buffer.
 *
 * A byte offset is derived from the output UV.  Offsets below u_plane_offset
 * produce four horizontally adjacent samples' .y channel; offsets at or above
 * it produce the .rb channels of two samples taken two source texels apart.
 * (Presumably .y is luma and .r/.b are the two chroma channels of the source
 * image -- TODO confirm against the caller.) */
float4 PSNV12(VertInOut vert_in) : TARGET
{
	float row = floor(vert_in.uv.y * input_height);
	float byte_offset = floor((row + vert_in.uv.x) * width) * 4.0;
	byte_offset += PRECISION_OFFSET;

	if (byte_offset < u_plane_offset) {
#ifdef DEBUGGING
		return float4(1.0, 1.0, 1.0, 1.0);
#endif

		/* texel-center coordinates of the first of four samples */
		float u0 = floor(fmod(byte_offset, width)) * width_i
			+ width_i * 0.5;
		float v0 = floor(byte_offset * width_i) * height_i
			+ height_i * 0.5;

		/* step one source texel to the right for each further sample */
		float u1 = u0 + width_i;
		float u2 = u1 + width_i;
		float u3 = u2 + width_i;

		return float4(
			image.Sample(def_sampler, float2(u0, v0)).y,
			image.Sample(def_sampler, float2(u1, v0)).y,
			image.Sample(def_sampler, float2(u2, v0)).y,
			image.Sample(def_sampler, float2(u3, v0)).y
		);
	}

#ifdef DEBUGGING
	return float4(0.5, 0.2, 0.5, 0.2);
#endif

	float plane_offset = byte_offset - u_plane_offset;

	/* land on the border shared by each group of source texels so the
	 * linear sampler averages them */
	float c_u = floor(fmod(plane_offset, width)) * width_i + width_i;
	float c_v = floor(plane_offset * width_i) * height_d2_i + height_i;

	float2 first_pos  = float2(c_u, c_v);
	float2 second_pos = float2(c_u + width_i * 2.0, c_v);

	return float4(
		image.Sample(def_sampler, first_pos).rb,
		image.Sample(def_sampler, second_pos).rb
	);
}
|
|
|
|
/* Outputs only the .y channel of the source image at the interpolated UV. */
float PSNV12_Y(VertInOut vert_in) : TARGET
{
	float4 texel = image.Sample(def_sampler, vert_in.uv.xy);
	return texel.y;
}
|
|
|
|
/* Outputs the .x and .z channels of the source image at the interpolated UV
 * as a two-component value. */
float2 PSNV12_UV(VertInOut vert_in) : TARGET
{
	float4 texel = image.Sample(def_sampler, vert_in.uv.xy);
	return texel.xz;
}
|
|
|
|
/* Packs the source image into a planar 4:2:0 style buffer.
 *
 * A byte offset is derived from the output UV and compared against
 * u_plane_offset / v_plane_offset to decide which plane this output texel
 * belongs to.  Four source positions are sampled and one channel of each is
 * returned:
 *   - below u_plane_offset:                    .y of each sample
 *   - between u_plane_offset and v_plane_offset: .x of each sample
 *   - at or above v_plane_offset:               .z of each sample
 * (Presumably the source holds luma in .y and the chroma pair in .x/.z --
 * TODO confirm against the caller.)
 *
 * Fix: when the four chroma samples straddle two chroma lines, the fourth
 * position is now written to sample_pos[3].  Previously sample_pos[2] was
 * assigned twice and sample_pos[3] was left uninitialized. */
float4 PSPlanar420(VertInOut vert_in) : TARGET
{
	float v_mul = floor(vert_in.uv.y * input_height);

	float byte_offset = floor((v_mul + vert_in.uv.x) * width) * 4.0;
	byte_offset += PRECISION_OFFSET;

	float2 sample_pos[4];

	if (byte_offset < u_plane_offset) {
#ifdef DEBUGGING
		return float4(1.0, 1.0, 1.0, 1.0);
#endif

		float lum_u = floor(fmod(byte_offset, width)) * width_i;
		float lum_v = floor(byte_offset * width_i) * height_i;

		/* move to texel centers to sample the 4 pixels properly */
		lum_u += width_i * 0.5;
		lum_v += height_i * 0.5;

		/* four horizontally adjacent source texels */
		sample_pos[0] = float2(lum_u, lum_v);
		sample_pos[1] = float2(lum_u += width_i, lum_v);
		sample_pos[2] = float2(lum_u += width_i, lum_v);
		sample_pos[3] = float2(lum_u + width_i, lum_v);

	} else {
#ifdef DEBUGGING
		return ((byte_offset < v_plane_offset) ?
			float4(0.5, 0.5, 0.5, 0.5) :
			float4(0.2, 0.2, 0.2, 0.2));
#endif

		/* offset relative to the start of whichever chroma plane
		 * this texel falls into */
		float new_offset = byte_offset -
			((byte_offset < v_plane_offset) ?
			u_plane_offset : v_plane_offset);

		float ch_u = floor(fmod(new_offset, width_d2)) * width_d2_i;
		float ch_v = floor(new_offset * width_d2_i) * height_d2_i;
		float width_i2 = width_i*2.0;

		/* move to the borders of each set of 4 pixels to force it
		 * to do bilinear averaging */
		ch_u += width_i;
		ch_v += height_i;

		/* set up coordinates for next chroma line, in case
		 * (width / 2) % 4 == 2, i.e. the current set of 4 pixels is
		 * split between the current and the next chroma line; do note
		 * that the next chroma line is two source lines below the
		 * current source line */
		float ch_u_n = 0. + width_i;
		float ch_v_n = ch_v + height_i * 3;

		sample_pos[0] = float2(ch_u, ch_v);
		sample_pos[1] = float2(ch_u += width_i2, ch_v);

		ch_u += width_i2;
		// check if ch_u overflowed the current source and chroma line
		if (ch_u > 1.0) {
			/* last two samples come from the start of the next
			 * chroma line */
			sample_pos[2] = float2(ch_u_n, ch_v_n);
			sample_pos[3] = float2(ch_u_n + width_i2, ch_v_n);
		} else {
			sample_pos[2] = float2(ch_u, ch_v);
			sample_pos[3] = float2(ch_u + width_i2, ch_v);
		}
	}

	float4x4 out_val = float4x4(
		image.Sample(def_sampler, sample_pos[0]),
		image.Sample(def_sampler, sample_pos[1]),
		image.Sample(def_sampler, sample_pos[2]),
		image.Sample(def_sampler, sample_pos[3])
	);

	/* after the transpose, row N holds channel N of all four samples */
	out_val = transpose(out_val);

	if (byte_offset < u_plane_offset)
		return out_val[1];
	else if (byte_offset < v_plane_offset)
		return out_val[0];
	else
		return out_val[2];
}
|
|
|
|
/* Packs the source image into a planar 4:4:4 style buffer.
 *
 * The byte offset derived from the output UV selects a plane by comparison
 * with u_plane_offset / v_plane_offset.  Four horizontally adjacent source
 * texels are sampled and one channel of each is returned: .y below
 * u_plane_offset, .x below v_plane_offset, .z otherwise. */
float4 PSPlanar444(VertInOut vert_in) : TARGET
{
	float row = floor(vert_in.uv.y * input_height);
	float byte_offset = floor((row + vert_in.uv.x) * width) * 4.0;
	byte_offset += PRECISION_OFFSET;

	/* start of the plane this texel belongs to (0 for the first plane) */
	float plane_start = 0.0;
	if (byte_offset >= v_plane_offset)
		plane_start = v_plane_offset;
	else if (byte_offset >= u_plane_offset)
		plane_start = u_plane_offset;

	float plane_offset = byte_offset - plane_start;

	/* texel-center coordinates of the first of four samples */
	float u0 = floor(fmod(plane_offset, width)) * width_i + width_i * 0.5;
	float v0 = floor(plane_offset * width_i) * height_i + height_i * 0.5;

	/* step one source texel to the right for each further sample */
	float u1 = u0 + width_i;
	float u2 = u1 + width_i;
	float u3 = u2 + width_i;

	float4 s0 = image.Sample(def_sampler, float2(u0, v0));
	float4 s1 = image.Sample(def_sampler, float2(u1, v0));
	float4 s2 = image.Sample(def_sampler, float2(u2, v0));
	float4 s3 = image.Sample(def_sampler, float2(u3, v0));

	if (byte_offset < u_plane_offset)
		return float4(s0.y, s1.y, s2.y, s3.y);

	if (byte_offset < v_plane_offset)
		return float4(s0.x, s1.x, s2.x, s3.x);

	return float4(s0.z, s1.z, s2.z, s3.z);
}
|
|
|
|
/* Loads the .r channel of the texel addressed by a linear offset, treating
 * the texture as rows of int_input_width texels (mip level 0). */
float GetIntOffsetColor(int offset)
{
	int column = offset % int_input_width;
	int line = offset / int_input_width;
	return image.Load(int3(column, line, 0)).r;
}
|
|
|
|
/* Unpacks a packed 4:2:2 texel into a color via color_matrix.
 *
 * u_pos, v_pos, y0_pos and y1_pos are the channel indices inside the sampled
 * texel that hold U, V and the two Y values.  Even output columns use y0_pos,
 * odd columns use y1_pos; both share the same U and V. */
float4 PSPacked422_Reverse(VertInOut vert_in, int u_pos, int v_pos,
		int y0_pos, int y1_pos) : TARGET
{
	/* 1.0 on odd output columns, 0.0 on even ones */
	float is_odd = floor(fmod(width * vert_in.uv.x + PRECISION_OFFSET, 2.0));

	/* snap to the packed texel that covers this pair of output columns */
	float packed_x = floor(width_d2 * vert_in.uv.x + PRECISION_OFFSET) *
			width_d2_i;
	/* presumably moves to the packed texel's center -- TODO confirm */
	packed_x += input_width_i_d2;

	float4 packed = image.Sample(def_sampler,
			float2(packed_x, vert_in.uv.y));

	float luma = packed[y0_pos];
	if (is_odd > 0.5)
		luma = packed[y1_pos];

	float3 yuv = clamp(float3(luma, packed[u_pos], packed[v_pos]),
			color_range_min, color_range_max);
	return saturate(mul(float4(yuv, 1.0), color_matrix));
}
|
|
|
|
/* Converts planar 4:2:0 data stored linearly in the source texture to a
 * color: one value from the first plane plus one value from each of the
 * planes starting at int_u_plane_offset and int_v_plane_offset. */
float4 PSPlanar420_Reverse(VertInOut vert_in) : TARGET
{
	int px = int(vert_in.uv.x * width + PRECISION_OFFSET);
	int py = int(vert_in.uv.y * height + PRECISION_OFFSET);

	/* chroma planes are half width and half height */
	int chroma_index = (py / 2) * (int_width / 2) + px / 2;

	float y_val = GetIntOffsetColor(py * int_width + px);
	float u_val = GetIntOffsetColor(int_u_plane_offset + chroma_index);
	float v_val = GetIntOffsetColor(int_v_plane_offset + chroma_index);

	float3 yuv = clamp(float3(y_val, u_val, v_val),
			color_range_min, color_range_max);
	return saturate(mul(float4(yuv, 1.0), color_matrix));
}
|
|
|
|
/* Converts planar 4:4:4 data stored linearly in the source texture to a
 * color.  All three planes use the same per-pixel index; the second and
 * third are offset by int_u_plane_offset and int_v_plane_offset. */
float4 PSPlanar444_Reverse(VertInOut vert_in) : TARGET
{
	int px = int(vert_in.uv.x * width + PRECISION_OFFSET);
	int py = int(vert_in.uv.y * height + PRECISION_OFFSET);

	/* full-resolution planes: same index within every plane */
	int pixel_index = py * int_width + px;

	float y_val = GetIntOffsetColor(pixel_index);
	float u_val = GetIntOffsetColor(int_u_plane_offset + pixel_index);
	float v_val = GetIntOffsetColor(int_v_plane_offset + pixel_index);

	float3 yuv = clamp(float3(y_val, u_val, v_val),
			color_range_min, color_range_max);
	return saturate(mul(float4(yuv, 1.0), color_matrix));
}
|
|
|
|
/* Converts NV12-style data stored linearly in the source texture to a color:
 * one value from the first plane plus two adjacent values from the
 * interleaved plane that starts at int_u_plane_offset. */
float4 PSNV12_Reverse(VertInOut vert_in) : TARGET
{
	int px = int(vert_in.uv.x * width + PRECISION_OFFSET);
	int py = int(vert_in.uv.y * height + PRECISION_OFFSET);

	/* index of the chroma pair, then doubled because the two chroma
	 * values are interleaved */
	int pair_index = (py / 2) * (int_width / 2) + px / 2;
	int pair_start = int_u_plane_offset + pair_index * 2;

	float y_val = GetIntOffsetColor(py * int_width + px);
	float u_val = GetIntOffsetColor(pair_start);
	float v_val = GetIntOffsetColor(pair_start + 1);

	float3 yuv = clamp(float3(y_val, u_val, v_val),
			color_range_min, color_range_max);
	return saturate(mul(float4(yuv, 1.0), color_matrix));
}
|
|
|
|
/* Reads a single-channel value, expands it from limited range
 * (subtract 16/255, scale by 255/219) and replicates it to an opaque gray. */
float4 PSY800_Limited(VertInOut vert_in) : TARGET
{
	int px = int(vert_in.uv.x * width + PRECISION_OFFSET);
	int py = int(vert_in.uv.y * height + PRECISION_OFFSET);

	float raw = image.Load(int3(px, py, 0)).x;
	float gray = saturate((raw - (16.0 / 255.0)) * (255.0 / 219.0));

	return float4(gray, gray, gray, 1.0);
}
|
|
|
|
/* Reads a single-channel value and replicates it unchanged to an opaque
 * gray. */
float4 PSY800_Full(VertInOut vert_in) : TARGET
{
	int px = int(vert_in.uv.x * width + PRECISION_OFFSET);
	int py = int(vert_in.uv.y * height + PRECISION_OFFSET);

	float gray = image.Load(int3(px, py, 0)).x;

	return float4(gray, gray, gray, 1.0);
}
|
|
|
|
/* Loads an RGBA texel and expands its RGB channels from limited range
 * (subtract 16/255, scale by 255/219); alpha is passed through untouched. */
float4 PSRGB_Limited(VertInOut vert_in) : TARGET
{
	int px = int(vert_in.uv.x * width + PRECISION_OFFSET);
	int py = int(vert_in.uv.y * height + PRECISION_OFFSET);

	float4 texel = image.Load(int3(px, py, 0));
	float3 expanded = saturate((texel.rgb - (16.0 / 255.0)) *
			(255.0 / 219.0));

	return float4(expanded, texel.a);
}
|
|
|
|
/* Rebuilds an opaque RGB color from three horizontally adjacent
 * single-channel texels (read as B, G, R from left to right) and expands it
 * from limited range (subtract 16/255, scale by 255/219). */
float4 PSBGR3_Limited(VertInOut vert_in) : TARGET
{
	/* three source texels per output pixel; this is the middle one */
	int mid = int(vert_in.uv.x * width * 3.0 + PRECISION_OFFSET);
	int py = int(vert_in.uv.y * height + PRECISION_OFFSET);

	float blue  = image.Load(int3(mid - 1, py, 0)).x;
	float green = image.Load(int3(mid, py, 0)).x;
	float red   = image.Load(int3(mid + 1, py, 0)).x;

	float3 expanded = saturate((float3(red, green, blue) -
			(16.0 / 255.0)) * (255.0 / 219.0));

	return float4(expanded, 1.0);
}
|
|
|
|
/* Rebuilds an opaque RGB color from three horizontally adjacent
 * single-channel texels (read as B, G, R from left to right) without any
 * range adjustment. */
float4 PSBGR3_Full(VertInOut vert_in) : TARGET
{
	/* three source texels per output pixel; this is the middle one */
	int mid = int(vert_in.uv.x * width * 3.0 + PRECISION_OFFSET);
	int py = int(vert_in.uv.y * height + PRECISION_OFFSET);

	float blue  = image.Load(int3(mid - 1, py, 0)).x;
	float green = image.Load(int3(mid, py, 0)).x;
	float red   = image.Load(int3(mid + 1, py, 0)).x;

	return float4(red, green, blue, 1.0);
}
|
|
|
|
/* Single pass: VSDefault + PSPlanar420. */
technique Planar420
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSPlanar420(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSPlanar444. */
technique Planar444
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSPlanar444(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSNV12. */
technique NV12
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSNV12(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSNV12_Y. */
technique NV12_Y
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSNV12_Y(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSNV12_UV. */
technique NV12_UV
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSNV12_UV(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSPacked422_Reverse with channel indices
 * u_pos = 2, v_pos = 0, y0_pos = 1, y1_pos = 3. */
technique UYVY_Reverse
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSPacked422_Reverse(vert_in, 2, 0, 1, 3);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSPacked422_Reverse with channel indices
 * u_pos = 1, v_pos = 3, y0_pos = 2, y1_pos = 0. */
technique YUY2_Reverse
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSPacked422_Reverse(vert_in, 1, 3, 2, 0);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSPacked422_Reverse with channel indices
 * u_pos = 3, v_pos = 1, y0_pos = 2, y1_pos = 0. */
technique YVYU_Reverse
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSPacked422_Reverse(vert_in, 3, 1, 2, 0);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSPlanar420_Reverse. */
technique I420_Reverse
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSPlanar420_Reverse(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSPlanar444_Reverse. */
technique I444_Reverse
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSPlanar444_Reverse(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSNV12_Reverse. */
technique NV12_Reverse
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSNV12_Reverse(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSY800_Limited. */
technique Y800_Limited
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSY800_Limited(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSY800_Full. */
technique Y800_Full
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSY800_Full(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSRGB_Limited. */
technique RGB_Limited
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSRGB_Limited(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSBGR3_Limited. */
technique BGR3_Limited
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSBGR3_Limited(vert_in);
	}
}
|
|
|
|
/* Single pass: VSDefault + PSBGR3_Full. */
technique BGR3_Full
{
	pass
	{
		vertex_shader = VSDefault(id);
		pixel_shader  = PSBGR3_Full(vert_in);
	}
}
|