0
0
mirror of https://github.com/obsproject/obs-studio.git synced 2024-09-20 13:08:50 +02:00

obs-filters: Reduce 3D LUT calculations

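Single tap is good enough: one linearly filtered sample of the 3D LUT texture replaces the previous two-sample interpolation in the LUT3D shader.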

Intel UHD Graphics 750: 860 µs -> 591 µs
This commit is contained in:
jpark37 2022-02-16 23:13:19 -08:00 committed by Jim
parent 8e8c29d84c
commit 873e3a0aae
2 changed files with 13 additions and 88 deletions

View File

@ -292,10 +292,12 @@ static void color_grade_filter_update(void *data, obs_data_t *settings)
filter->target = make_clut_texture_png(
filter->image.format, filter->image.cx,
filter->image.cy, filter->image.texture_data);
const float clut_scale = (float)(LUT_WIDTH - 1);
const float width_i = 1.0f / (float)LUT_WIDTH;
const float clut_scale = 1.0f - width_i;
const float offset = 0.5f * width_i;
vec3_set(&filter->clut_scale, clut_scale, clut_scale,
clut_scale);
vec3_set(&filter->clut_offset, 0.f, 0.f, 0.f);
vec3_set(&filter->clut_offset, offset, offset, offset);
} else if (filter->cube_data) {
const uint32_t width = filter->cube_width;
if (filter->clut_dim == CLUT_1D) {
@ -324,16 +326,13 @@ static void color_grade_filter_update(void *data, obs_data_t *settings)
vec3_mul(&filter->clut_offset, &filter->clut_offset,
&filter->clut_scale);
/* 1D shader wants normalized UVW */
if (filter->clut_dim == CLUT_1D) {
vec3_divf(&filter->clut_scale,
&filter->clut_scale, (float)width);
vec3_addf(&filter->clut_offset,
&filter->clut_offset, 0.5f);
vec3_divf(&filter->clut_offset,
&filter->clut_offset, (float)width);
}
/* want normalized UVW */
vec3_divf(&filter->clut_scale, &filter->clut_scale,
(float)width);
vec3_addf(&filter->clut_offset, &filter->clut_offset,
0.5f);
vec3_divf(&filter->clut_offset, &filter->clut_offset,
(float)width);
}
}
@ -455,9 +454,6 @@ static void color_grade_filter_render(void *data, gs_effect_t *effect)
param = gs_effect_get_param_by_name(filter->effect, "domain_max");
gs_effect_set_vec3(param, &filter->domain_max);
param = gs_effect_get_param_by_name(filter->effect, "cube_width_i");
gs_effect_set_float(param, 1.0f / filter->cube_width);
gs_blend_state_push();
gs_blend_function(GS_BLEND_ONE, GS_BLEND_INVSRCALPHA);

View File

@ -8,7 +8,6 @@ uniform float3 clut_scale;
uniform float3 clut_offset;
uniform float3 domain_min;
uniform float3 domain_max;
uniform float cube_width_i;
sampler_state textureSampler {
Filter = Linear;
@ -85,78 +84,8 @@ float4 LUT3D(VertDataOut v_in) : TARGET
g >= domain_min.g && g <= domain_max.g &&
b >= domain_min.b && b <= domain_max.b)
{
float3 clut_pos = nonlinear * clut_scale + clut_offset;
float3 floor_pos = floor(clut_pos);
float3 fracRGB = clut_pos - floor_pos;
float3 uvw0 = (floor_pos + 0.5) * cube_width_i;
float3 uvw3 = (floor_pos + 1.5) * cube_width_i;
float fracL, fracM, fracS;
float3 uvw1, uvw2;
if (fracRGB.r < fracRGB.g) {
if (fracRGB.r < fracRGB.b) {
if (fracRGB.g < fracRGB.b) {
// f(R) < f(G) < f(B)
fracL = fracRGB.b;
fracM = fracRGB.g;
fracS = fracRGB.r;
uvw1 = float3(uvw0.x, uvw0.y, uvw3.z);
uvw2 = float3(uvw0.x, uvw3.y, uvw3.z);
} else {
// f(R) < f(B) <= f(G)
fracL = fracRGB.g;
fracM = fracRGB.b;
fracS = fracRGB.r;
uvw1 = float3(uvw0.x, uvw3.y, uvw0.z);
uvw2 = float3(uvw0.x, uvw3.y, uvw3.z);
}
} else {
// f(B) <= f(R) < f(G)
fracL = fracRGB.g;
fracM = fracRGB.r;
fracS = fracRGB.b;
uvw1 = float3(uvw0.x, uvw3.y, uvw0.z);
uvw2 = float3(uvw3.x, uvw3.y, uvw0.z);
}
} else if (fracRGB.r < fracRGB.b) {
// f(G) <= f(R) < f(B)
fracL = fracRGB.b;
fracM = fracRGB.r;
fracS = fracRGB.g;
uvw1 = float3(uvw0.x, uvw0.y, uvw3.z);
uvw2 = float3(uvw3.x, uvw0.y, uvw3.z);
} else if (fracRGB.g < fracRGB.b) {
// f(G) < f(B) <= f(R)
fracL = fracRGB.r;
fracM = fracRGB.b;
fracS = fracRGB.g;
uvw1 = float3(uvw3.x, uvw0.y, uvw0.z);
uvw2 = float3(uvw3.x, uvw0.y, uvw3.z);
} else {
// f(B) <= f(G) <= f(R)
fracL = fracRGB.r;
fracM = fracRGB.g;
fracS = fracRGB.b;
uvw1 = float3(uvw3.x, uvw0.y, uvw0.z);
uvw2 = float3(uvw3.x, uvw3.y, uvw0.z);
}
/* use filtering to collapse 4 taps to 2 */
/* use max to kill potential zero-divide NaN */
float coeff01 = (1.0 - fracM);
float weight01 = max((fracL - fracM) / coeff01, 0.0);
float3 uvw01 = lerp(uvw0, uvw1, weight01);
float3 sample01 = clut_3d.Sample(textureSampler, uvw01).rgb;
float coeff23 = fracM;
float weight23 = max(fracS / coeff23, 0.0);
float3 uvw23 = lerp(uvw2, uvw3, weight23);
float3 sample23 = clut_3d.Sample(textureSampler, uvw23).rgb;
float3 luttedColor = (coeff01 * sample01) + (coeff23 * sample23);
float3 clut_uvw = nonlinear * clut_scale + clut_offset;
float3 luttedColor = clut_3d.Sample(textureSampler, clut_uvw).rgb;
textureColor.rgb = lerp(textureColor.rgb, luttedColor, clut_amount);
}