Deferred shading decouples geometry processing from lighting. The geometry pass writes surface attributes into a G-buffer; the lighting pass reads the G-buffer and evaluates lights as fullscreen passes. Lighting complexity becomes O(lights × pixels) rather than O(lights × fragments × geometry).
| Attachment | Format | Contents |
|---|---|---|
| COLOR0 | RGB8 | Albedo (diffuse color) |
| COLOR1 | RGB16F | World-space normal (or view-space) |
| COLOR2 | RGB8 | Metallic / Roughness / AO |
| COLOR3 | RGBA16F | Emissive + misc flags |
| DEPTH | DEPTH24_STENCIL8 | Depth + stencil |
// Create and attach G-buffer textures
glGenFramebuffers(1, &gBuffer);
glBindFramebuffer(GL_FRAMEBUFFER, gBuffer);
GLenum attachments[] = {
GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1,
GL_COLOR_ATTACHMENT2, GL_COLOR_ATTACHMENT3
};
for (int i = 0; i < 4; ++i) {
glGenTextures(1, &gTextures[i]);
glBindTexture(GL_TEXTURE_2D, gTextures[i]);
glTexImage2D(GL_TEXTURE_2D, 0,
internalFormats[i], width, height, 0,
GL_RGBA, GL_FLOAT, nullptr);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glFramebufferTexture2D(GL_FRAMEBUFFER,
attachments[i], GL_TEXTURE_2D, gTextures[i], 0);
}
glDrawBuffers(4, attachments);
C++
#version 450 core
// Geometry pass fragment shader: writes surface attributes into the
// G-buffer MRTs. Output locations match the attachment table above
// (albedo / normal / metallic-roughness-AO / emissive).
layout(location = 0) out vec3 gAlbedo;
layout(location = 1) out vec3 gNormal;
layout(location = 2) out vec3 gMRA; // metallic, roughness, AO
layout(location = 3) out vec4 gEmissive;
in vec3 fragPos;
in vec3 fragNormal;
in vec2 fragUV;
uniform sampler2D uAlbedoMap;
uniform sampler2D uNormalMap;
uniform sampler2D uMRAMap;
void main() {
// Interpolation denormalizes the per-vertex normal; renormalize first.
vec3 normal = normalize(fragNormal);
// Encode normal from [-1,1] to [0,1] range for storage
// (the lighting/SSAO passes decode with * 2.0 - 1.0).
gNormal = normal * 0.5 + 0.5;
gAlbedo = texture(uAlbedoMap, fragUV).rgb;
gMRA = texture(uMRAMap, fragUV).rgb;
// NOTE(review): uNormalMap is declared but never sampled — presumably
// tangent-space normal mapping is intended here; confirm.
gEmissive = vec4(0.0); // no emissive surfaces in this example
}
GLSL
Use GL_NEAREST filtering on G-buffer textures: bilinear filtering on normals or depth introduces artifacts in the lighting pass where surface properties mix across edges. The lighting pass is a fullscreen triangle (not a quad — a single triangle covering clip space avoids the diagonal overdraw artifact). Reconstruct world position from depth and the inverse view-projection matrix:
// Unproject a depth-buffer sample back to world space.
// uv and depth are both in [0,1]; invVP is inverse(projection * view).
vec3 ReconstructPosition(float depth, vec2 uv, mat4 invVP) {
    // Rebuild the NDC position, then undo the view-projection transform.
    vec4 ndc = vec4(uv * 2.0 - 1.0, depth * 2.0 - 1.0, 1.0);
    vec4 homogeneous = invVP * ndc;
    vec3 worldPos = homogeneous.xyz / homogeneous.w;
    return worldPos;
}
GLSL
Render the scene from the light's perspective into a depth texture. During shading, transform the fragment into light-clip space and compare its depth against the stored value.
PCF samples the shadow map at multiple nearby texels and averages the binary results. It does not blur depth values — it blurs the shadow test, which is correct.
// 5x5 percentage-closer filtering using the hardware depth comparison.
// Returns a lit factor in [0,1]: 0 = fully shadowed, 1 = fully lit.
float ShadowPCF(sampler2DShadow shadowMap, vec4 lightSpacePos, float bias) {
    // Perspective divide + remap to [0,1] shadow-map texture space.
    vec3 projCoords = lightSpacePos.xyz / lightSpacePos.w;
    projCoords = projCoords * 0.5 + 0.5;
    // Fragments beyond the light's far plane cannot be shadow-tested;
    // treat them as lit instead of comparing against an invalid depth.
    if (projCoords.z > 1.0)
        return 1.0;
    float shadow = 0.0;
    vec2 texelSize = 1.0 / vec2(textureSize(shadowMap, 0));
    for (int x = -2; x <= 2; ++x) {
        for (int y = -2; y <= 2; ++y) {
            vec3 coord = vec3(
                projCoords.xy + vec2(x, y) * texelSize,
                projCoords.z - bias // bias the receiver toward the light to avoid acne
            );
            shadow += texture(shadowMap, coord); // hw comparison: 1 = lit
        }
    }
    return shadow / 25.0; // average of 5x5 taps
}
GLSL
Split the view frustum into N sub-frusta. Each sub-frustum gets its own shadow map. Near cascades use high resolution; far cascades use lower. Select the correct cascade in the shader by comparing view-space depth against split distances:
// Select the shadow cascade whose depth range contains this fragment.
// cascadeSplits[] holds the far boundary of each cascade in view-space
// depth; anything past the last split lands in the coarsest cascade.
int GetCascadeIndex(float viewDepth) {
    int cascade = 0;
    while (cascade < NUM_CASCADES - 1 && viewDepth >= cascadeSplits[cascade])
        ++cascade;
    return cascade;
}
GLSL
Store depth and depth² in an RGBA16F texture. Use Chebyshev's inequality to compute an upper bound on shadow probability. VSM is filterable with hardware bilinear and mipmapping, enabling cheap soft shadows — but suffers from light bleeding when occluders are close together.
// Variance shadow mapping via Chebyshev's one-tailed inequality.
// The map stores (E[d], E[d^2]); returns an upper bound on the lit
// probability in [0,1]. Filterable, but prone to light bleeding.
float VSMShadow(sampler2D shadowMap, vec2 uv, float fragDepth) {
    vec2 moments = texture(shadowMap, uv).rg;
    float mean = moments.x;
    // Fully lit when the receiver is in front of the mean occluder depth.
    float litHard = step(fragDepth, mean);
    // Clamp variance to sidestep numerical problems on flat receivers.
    float sigma2 = max(moments.y - (mean * mean), 0.00002);
    float delta = fragDepth - mean;
    float chebyshevBound = sigma2 / (sigma2 + delta * delta);
    return max(litHard, chebyshevBound);
}
GLSL
SSAO approximates ambient occlusion by sampling the depth buffer in a hemisphere around each fragment and counting how many samples are occluded. All computation is in screen space — no geometry is traversed.
// Generate N samples in a unit hemisphere aligned to +Z
std::vector<glm::vec3> GenerateSSAOKernel(int numSamples) {
std::uniform_real_distribution<float> rng(0.0f, 1.0f);
std::default_random_engine gen;
std::vector<glm::vec3> kernel(numSamples);
for (int i = 0; i < numSamples; ++i) {
glm::vec3 s = {
rng(gen) * 2.0f - 1.0f,
rng(gen) * 2.0f - 1.0f,
rng(gen) // hemisphere: z in [0,1]
};
s = glm::normalize(s);
// Accelerate toward origin with lerp
float scale = (float)i / numSamples;
scale = glm::mix(0.1f, 1.0f, scale * scale);
kernel[i] = s * scale;
}
return kernel;
}
C++
const int KERNEL_SIZE = 64;
uniform vec3 uSamples[KERNEL_SIZE];
uniform sampler2D uDepth;
uniform sampler2D uNormal;
uniform sampler2D uNoise; // 4x4 tiled rotation vectors
uniform mat4 uProjection;
// SSAO occlusion pass: hemisphere-sample view-space depth around each
// fragment; outputs an occlusion factor (1 = unoccluded) for later blur.
void main() {
    vec2 uv = fragUV;
    // Decode the [0,1]-encoded G-buffer normal back to [-1,1].
    vec3 normal = normalize(texture(uNormal, uv).rgb * 2.0 - 1.0);
    vec3 randVec = normalize(texture(uNoise, uv * noiseScale).rgb);
    // Gram–Schmidt: build TBN from noise + normal (random per-pixel rotation)
    vec3 tangent = normalize(randVec - normal * dot(randVec, normal));
    vec3 bitangent = cross(normal, tangent);
    mat3 TBN = mat3(tangent, bitangent, normal);
    float occlusion = 0.0;
    vec3 fragPos = ReconstructViewPos(uv);
    for (int i = 0; i < KERNEL_SIZE; ++i) {
        vec3 samplePos = fragPos + TBN * uSamples[i] * uRadius; // view space
        // Project the sample to screen space to find where to read the depth.
        vec4 offset = uProjection * vec4(samplePos, 1.0);
        offset.xyz /= offset.w;
        offset.xy = offset.xy * 0.5 + 0.5;
        // BUGFIX: a raw depth-buffer value is non-linear and in [0,1], while
        // samplePos.z is linear view-space depth — the original compared the
        // two directly. Reconstruct the view-space position of the geometry
        // actually stored at the sample's screen location so both sides of
        // the occlusion test share the same space.
        float sceneDepth = ReconstructViewPos(offset.xy).z;
        // Fade out contributions from geometry far outside the radius.
        float rangeCheck = smoothstep(0.0, 1.0,
            uRadius / abs(fragPos.z - sceneDepth));
        occlusion += (sceneDepth >= samplePos.z + uBias ? 1.0 : 0.0) * rangeCheck;
    }
    fragColor = vec4(vec3(1.0 - occlusion / KERNEL_SIZE), 1.0);
}
GLSL
After the occlusion pass, apply a separable blur to remove the high-frequency noise introduced by the tiled random-rotation texture (the per-pixel rotation trades the small kernel's banding for dithered noise, which blurs away cleanly). Blur horizontally then vertically in two passes. Multiply the result into the ambient term during the lighting pass.
The Cook-Torrance BRDF separates reflectance into diffuse (Lambertian) and specular (microfacet) terms. Three functions govern the specular lobe: the Normal Distribution Function (NDF), the Geometry function, and the Fresnel equation.
// GGX / Trowbridge-Reitz normal distribution function.
// Uses Disney's alpha = roughness^2 remapping for a perceptually
// more linear roughness response.
float D_GGX(float NdotH, float roughness) {
    float alpha = roughness * roughness;
    float alpha2 = alpha * alpha;
    float denomTerm = NdotH * NdotH * (alpha2 - 1.0) + 1.0;
    return alpha2 / (PI * denomTerm * denomTerm);
}
// Smith geometry term (separable Schlick-GGX), combining masking and
// shadowing. k uses the (roughness+1)^2 / 8 remap for direct lighting.
float G_Smith(float NdotV, float NdotL, float roughness) {
    float r = roughness + 1.0;
    float k = (r * r) / 8.0; // direct lighting remapping
    float viewTerm = NdotV * (1.0 - k) + k;
    float lightTerm = NdotL * (1.0 - k) + k;
    return (NdotV / viewTerm) * (NdotL / lightTerm);
}
// Schlick's approximation to Fresnel reflectance: interpolates from the
// base reflectivity F0 to full reflectance at grazing angles.
vec3 F_Schlick(float cosTheta, vec3 F0) {
    float oneMinusCos = clamp(1.0 - cosTheta, 0.0, 1.0);
    return mix(F0, vec3(1.0), pow(oneMinusCos, 5.0));
}
// Full Cook-Torrance specular term: D * G * F / (4 (N·V)(N·L)).
// The epsilon in the denominator guards against division by zero at
// grazing angles where N·V or N·L reaches 0.
vec3 CookTorranceSpecular(vec3 N, vec3 V, vec3 L,
                          vec3 F0, float roughness) {
    vec3 H = normalize(V + L);
    float NdotV = max(dot(N, V), 0.0);
    float NdotL = max(dot(N, L), 0.0);
    float NdotH = max(dot(N, H), 0.0);
    float HdotV = max(dot(H, V), 0.0);
    vec3 numerator = D_GGX(NdotH, roughness)
                   * G_Smith(NdotV, NdotL, roughness)
                   * F_Schlick(HdotV, F0);
    float denominator = 4.0 * NdotV * NdotL + 0.0001;
    return numerator / denominator;
}
GLSL
Split-sum approximation separates the IBL integral into two pre-computed lookups: a pre-filtered environment map (sampled by reflection vector and roughness level) and a BRDF LUT (indexed by NdotV and roughness). Both are computed offline.
// Image-based lighting via the split-sum approximation: diffuse from a
// pre-convolved irradiance cubemap, specular from a prefiltered
// environment map combined with the 2D BRDF lookup table.
vec3 IBL(vec3 N, vec3 V, vec3 albedo,
         float metallic, float roughness) {
    float NdotV = max(dot(N, V), 0.0);
    // Dielectrics reflect ~4%; metals tint F0 with their albedo.
    vec3 F0 = mix(vec3(0.04), albedo, metallic);
    vec3 F = F_SchlickRoughness(NdotV, F0, roughness);
    // Energy conservation: specular reflectance reduces the diffuse
    // share, and pure metals have no diffuse term at all.
    vec3 kD = (1.0 - F) * (1.0 - metallic);
    vec3 diffuse = kD * texture(uIrradianceMap, N).rgb * albedo;
    // Rougher surfaces sample blurrier mips of the prefiltered map.
    vec3 R = reflect(-V, N);
    float maxLod = 4.0;
    vec3 prefiltered = textureLod(
        uPrefilteredMap, R, roughness * maxLod).rgb;
    vec2 envBRDF = texture(uBRDFLUT, vec2(NdotV, roughness)).rg;
    vec3 specular = prefiltered * (F * envBRDF.x + envBRDF.y);
    return diffuse + specular;
}
Render to an RGBA16F or RGBA32F framebuffer to preserve values beyond [0,1]. A final fullscreen pass converts HDR luminance to LDR for display. Tone mapping operators differ in how they compress the high-luminance range.
| Operator | Formula | Characteristic |
|---|---|---|
| Reinhard | x / (x + 1) | Simple; desaturates highlights |
| Reinhard (Extended) | x(1 + x/W²) / (1 + x) | Preserves whites up to max W |
| ACES (Filmic) | Piecewise rational | Film-like contrast; saturated shadows |
| Uncharted 2 | Hable curve | Shoulder control; widely used in games |
// ACES filmic tone map approximation (Narkowicz 2015): fits the ACES
// RRT+ODT curve with a rational polynomial; output clamped to [0,1].
vec3 ACESFilmic(vec3 x) {
    const float a = 2.51, b = 0.03, c = 2.43, d = 0.59, e = 0.14;
    vec3 numerator = x * (a * x + b);
    vec3 denominator = x * (c * x + d) + e;
    return clamp(numerator / denominator, 0.0, 1.0);
}
// Final fullscreen pass: exposure -> tone map -> gamma encode.
void main() {
    // Exposure scales scene luminance before the tone curve is applied.
    vec3 color = texture(uHDRBuffer, fragUV).rgb * uExposure;
    color = ACESFilmic(color);
    // Manual linear -> sRGB encode. Skip this if the target framebuffer
    // is an sRGB format — the driver would then gamma-encode twice.
    color = pow(color, vec3(1.0 / 2.2));
    fragColor = vec4(color, 1.0);
}
GLSL
When rendering into an sRGB framebuffer (e.g. GL_SRGB8_ALPHA8), OpenGL performs gamma correction automatically on write. Do not also apply pow(x, 1/2.2) in the shader — that double-applies gamma. Bloom: threshold the HDR buffer to extract bright pixels (luminance above a configurable cutoff). Apply a multi-pass dual Kawase blur or a downsample/upsample pyramid. Additively blend the result back into the HDR buffer before tone mapping, not after.
Compute shaders (GL 4.3+) run arbitrary work on the GPU without rasterization. They operate on shader storage buffer objects (SSBOs) and images. Local workgroup size is declared in-shader; global dispatch is issued from the CPU.
#version 450 core
// Image-processing compute shader: one invocation per pixel,
// 16x16 invocations per workgroup.
layout(local_size_x = 16, local_size_y = 16) in;
// readonly/writeonly qualifiers match the GL_READ_ONLY / GL_WRITE_ONLY
// access flags passed to glBindImageTexture and let the compiler reject
// mismatched image operations.
layout(rgba16f, binding = 0) uniform readonly image2D uInput;
layout(rgba16f, binding = 1) uniform writeonly image2D uOutput;
void main() {
    ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
    ivec2 size = imageSize(uInput);
    // The dispatch is rounded up, so edge workgroups can overhang the image.
    if (coord.x >= size.x || coord.y >= size.y) return;
    vec4 pixel = imageLoad(uInput, coord);
    // ... process pixel ...
    imageStore(uOutput, coord, pixel);
}
// Dispatch from C++
glUseProgram(computeProgram);
// Bind mip level 0 of each texture as an image unit; the access flags
// declare how the shader will use them.
glBindImageTexture(0, inputTex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA16F);
glBindImageTexture(1, outputTex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA16F);
// Round up so partially-filled edge workgroups are still dispatched.
GLuint groupsX = (width + 15) / 16;
GLuint groupsY = (height + 15) / 16;
glDispatchCompute(groupsX, groupsY, 1);
// Ensure writes complete before subsequent read
// NOTE(review): this bit covers later imageLoad/imageStore access. If the
// result is instead consumed via texture() sampling, the correct bit is
// GL_TEXTURE_FETCH_BARRIER_BIT — confirm how outputTex is read next.
glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
Call glMemoryBarrier with the appropriate bits between a compute write and a subsequent shader read. Missing barriers produce undefined results — not a deterministic error. GPU particle systems: store particle state in an SSBO, update it each frame with a compute shader, then draw the results with a vertex shader that reads the same SSBO. No CPU readback required. Typical setup:
// Particle update: one invocation per particle, state lives in an SSBO
// shared with the vertex shader that draws the particles.
layout(std430, binding = 0) buffer ParticleBuffer {
    Particle particles[];
};
void main() {
    uint idx = gl_GlobalInvocationID.x;
    // The dispatch is rounded up to workgroup granularity, so trailing
    // invocations can index past the end of the buffer — bail out.
    if (idx >= uint(particles.length())) return;
    Particle p = particles[idx];
    // Explicit Euler: advance position with the pre-update velocity.
    p.position += p.velocity * uDeltaTime;
    p.velocity += uGravity * uDeltaTime;
    p.life -= uDeltaTime;
    particles[idx] = p;
}
GLSL
Indirect rendering stores draw call parameters in a GPU buffer (GL_DRAW_INDIRECT_BUFFER). The CPU issues a single call; the GPU reads parameters from the buffer. This eliminates per-draw CPU overhead and allows compute shaders to generate, cull, or reorder draw calls before they execute.
// Mirrors the layout OpenGL requires in GL_DRAW_INDIRECT_BUFFER for
// glDrawArraysIndirect / glMultiDrawArraysIndirect; field order and
// 4-byte unsigned sizes are fixed by the GL specification.
struct DrawArraysIndirectCommand {
GLuint count; // vertex count
GLuint instanceCount; // set to 0 to cull
GLuint first; // starting vertex
GLuint baseInstance; // gl_BaseInstance in shader
};
C++
A compute shader tests each object's AABB or bounding sphere against the six frustum planes. If the object fails, it sets instanceCount = 0 in the indirect buffer. The subsequent multi-draw call skips that object at zero CPU cost.
// GPU frustum culling: one invocation per draw command; culled objects
// get instanceCount = 0 so the multi-draw skips them.
layout(local_size_x = 64) in;
layout(std430, binding = 0) buffer BoundsBuffer { BoundingSphere bounds[]; };
layout(std430, binding = 1) buffer DrawBuffer { DrawCommand draws[]; };
uniform vec4 uFrustumPlanes[6];
// True unless the sphere lies entirely behind one of the six planes.
// Planes are (normal.xyz, d) with normals pointing into the frustum.
bool SphereInFrustum(vec3 center, float radius) {
    for (int i = 0; i < 6; ++i) {
        float signedDist = dot(uFrustumPlanes[i].xyz, center)
                         + uFrustumPlanes[i].w;
        // Fully outside this plane -> outside the frustum.
        if (signedDist < -radius)
            return false;
    }
    return true;
}
void main() {
    uint id = gl_GlobalInvocationID.x;
    if (id >= draws.length()) return;
    BoundingSphere s = bounds[id];
    draws[id].instanceCount = SphereInFrustum(s.center, s.radius) ? 1 : 0;
}
GLSL
Issue all draws with a single glMultiDrawArraysIndirect or glMultiDrawElementsIndirect call. Reduce state changes; state changes are expensive, while draw calls are cheap when indirect. TAA accumulates samples across multiple frames using reprojection. Each frame is rendered with a sub-pixel jitter applied to the projection matrix. The current frame is reprojected into the previous frame's space using a motion vector buffer, then blended with the history buffer.
// Sub-pixel jitter from the Halton (2,3) low-discrepancy sequence —
// covers the pixel footprint more evenly than pure random offsets and
// repeats on a 16-frame cycle.
glm::vec2 HaltonJitter(int frame, int width, int height) {
    const glm::vec2 sample = halton23(frame % 16); // bases 2 and 3, 16-frame cycle
    const glm::vec2 pixelScale(2.0f / width, 2.0f / height);
    return (sample - 0.5f) * pixelScale;
}
// Apply to projection matrix before render
// GLM is column-major: [2][0] / [2][1] are rows 0/1 of column 2, so the
// jitter is scaled by eye-space z; after the perspective divide this
// becomes a constant sub-pixel offset in NDC (assumes a standard
// perspective matrix with w = -z_eye).
projection[2][0] += jitter.x;
projection[2][1] += jitter.y;
C++
// Map a current-frame UV + depth to the matching UV in the previous
// frame. Handles camera motion only (static geometry).
vec2 Reproject(vec2 uv, float depth) {
    // Current pixel -> world space.
    vec4 ndc = vec4(uv * 2.0 - 1.0, depth * 2.0 - 1.0, 1.0);
    vec4 world = uInvViewProj * ndc;
    world /= world.w;
    // World -> previous frame's clip space -> UV.
    vec4 prevClip = uPrevViewProj * world;
    vec2 prevNdc = prevClip.xy / prevClip.w;
    return prevNdc * 0.5 + 0.5;
}
// TAA resolve: blend the reprojected history with the current frame,
// clamping history to the current 3x3 neighborhood to limit ghosting.
void main() {
    vec2 uv = fragUV;
    vec2 prevUV = Reproject(uv, texture(uDepth, uv).r);
    vec3 current = texture(uCurrentFrame, uv).rgb;
    // Disocclusion guard: if the reprojected UV falls off-screen there is
    // no valid history (screen edges, fast camera motion) — fall back to
    // the current frame instead of sampling clamped border texels.
    if (any(lessThan(prevUV, vec2(0.0))) || any(greaterThan(prevUV, vec2(1.0)))) {
        fragColor = vec4(current, 1.0);
        return;
    }
    vec3 history = texture(uHistoryBuffer, prevUV).rgb;
    // Clamp history to current neighborhood to prevent ghosting
    vec3 colorMin = current, colorMax = current;
    for(int i = 0; i < 9; ++i) { // 3x3 neighborhood
        vec3 s = textureLod(uCurrentFrame, uv + offsets[i], 0.0).rgb;
        colorMin = min(colorMin, s);
        colorMax = max(colorMax, s);
    }
    history = clamp(history, colorMin, colorMax);
    // 10% current / 90% history: long accumulation, clamp keeps it stable.
    fragColor = vec4(mix(history, current, 0.1), 1.0);
}
GLSL
For dynamic objects, write per-pixel motion vectors in the geometry pass by projecting the vertex's current and previous world positions into clip space and taking the difference. Store as RG16F. Static objects can derive motion from camera movement alone (the reprojection above covers this case).
// In vertex shader, output clip-space positions for both this frame and
// the previous one so the fragment stage can derive motion vectors.
out vec4 vCurrentPos;
out vec4 vPrevPos;
void main() {
    vec4 localPos = vec4(aPosition, 1.0);
    // Transform with this frame's and last frame's model matrices.
    vCurrentPos = uViewProj * (uModel * localPos);
    vPrevPos = uPrevViewProj * (uPrevModel * localPos);
    gl_Position = vCurrentPos;
}
GLSL
// In fragment shader, output the motion vector: the pixel's screen-space
// displacement (in UV units) between the previous frame and this one.
void main() {
    vec2 currentUV = (vCurrentPos.xy / vCurrentPos.w) * 0.5 + 0.5;
    vec2 previousUV = (vPrevPos.xy / vPrevPos.w) * 0.5 + 0.5;
    motionOut = currentUV - previousUV; // screen-space displacement
}
GLSL
All shader code targets GLSL 4.50 / OpenGL 4.5 Core Profile.
Assumes column-major matrices and a [0,1] depth range unless otherwise noted.