Varnish doesn't keep serving cache when the backends are down

Question

I have Varnish running behind HAProxy and in front of NGINX. The HAProxy server handles SSL termination, then forwards the traffic on port 80 to Varnish, which in turn forwards requests to the NGINX servers.

I'd like to set Varnish up so that if the NGINX servers are all down, it continues to serve the cached content until they're back up. But I don't seem to be able to get it quite right. I'm running varnish-6.0.8 revision 97e54ada6ac578af332e52b44d2038bb4fa4cd4a. My VCL version is 4.1.

Here is my configuration (sanitized):

vcl 4.1;
import directors;
import std;

# Backend definitions.
# The backends are named with underscores so that they match the identifiers
# used by the routing logic in vcl_recv below (the original declarations used
# "server-a" while vcl_recv referred to "server_a").

# Define Server A (used for every host not matched explicitly in vcl_recv)
backend server_a {
    .host = "xx.xx.xx.xx";
    .port = "80";
    .max_connections = 100;
    .probe = {
        .url = "/varnish-check.txt";
        .timeout = 1s;
        .interval = 5s;
        .window = 5;
        .threshold = 3;
    }
}

# Define Server B
backend server_b {
    .host = "xx.xx.xx.xx";
    .port = "80";
    .max_connections = 100;
    .probe = {
        .url = "/varnish-check.txt";
        .timeout = 1s;
        .interval = 5s;
        .window = 5;
        .threshold = 3;
    }
}

# Define Server C (port 8080, unlike servers A and B)
backend server_c {
    .host = "xx.xx.xx.xx";
    .port = "8080";
    .max_connections = 100;
    .probe = {
        .url = "/varnish-check.txt";
        .timeout = 1s;
        .interval = 5s;
        .window = 5;
        .threshold = 3;
    }
}

# Pick the backend from the Host header.
# Backends are assigned directly: the .backend() method belongs to director
# objects created with "new" in vcl_init, and this VCL creates none.
sub vcl_recv {
    if (req.http.host == "example.com" || req.http.host == "example2.com") {
        set req.backend_hint = server_b;
    } elseif (req.http.host == "example3.com") {
        set req.backend_hint = server_c;
    } else {
        set req.backend_hint = server_a;
    }
}

# Clients allowed to send PURGE requests; checked against client.ip in vcl_recv.
acl purge {
    "localhost";
    "127.0.0.1";
    "::1";
    # The two entries below are placeholders left by sanitizing the config
    "xx.xx.xx.xx";
    "<IPv6>";
}

# Main request handling: PURGE/ban support, WordPress bypass rules, cookie and
# URL normalisation, and the final choice between pass, pipe and hash.
sub vcl_recv {

    # Collapse X-Forwarded-For to its first entry
    set req.http.X-Forwarded-For = regsub(req.http.X-Forwarded-For,"^([^,]+)(,[^,]+)*","\1");

    if (req.method == "PURGE") {
        # Only clients in the "purge" ACL may invalidate content
        if (!client.ip ~ purge) {
            return (synth(405, "This IP is not allowed to send PURGE requests."));
        }
        # "X-Purge-Method: regex" issues a ban matching on the x-url / x-host
        # headers that vcl_backend_response stores on each object
        if (req.http.X-Purge-Method == "regex") {
            ban("obj.http.x-url ~ " + req.url + " && obj.http.x-host ~ " + req.http.host);
            return (synth(200, "Banned"));
        }
        return (purge);
    }

    # Wordpress: don't cache these special pages
    if (req.url ~ "(wp-admin|post\.php|edit\.php|wp-login)") {
        return(pass);
    }

    # Wordpress: don't cache users who are logged-in or on password-protected pages
    if (req.http.Cookie ~ "wordpress_logged_in_|resetpass|wp-postpass_") {
        return(pass);
    }

    # Remove cookies
    set req.http.Cookie = regsuball(req.http.Cookie, "comment_author_[a-zA-Z0-9_]+", "");
    set req.http.Cookie = regsuball(req.http.Cookie, "has_js=[^;]+(; )?", "");
    set req.http.Cookie = regsuball(req.http.Cookie, "wp-settings-1=[^;]+(; )?", "");
    set req.http.Cookie = regsuball(req.http.Cookie, "wp-settings-time-1=[^;]+(; )?", "");
    set req.http.Cookie = regsuball(req.http.Cookie, "wordpress_test_cookie=[^;]+(; )?", "");
    set req.http.Cookie = regsuball(req.http.Cookie, "PHPSESSID=[^;]+(; )?", "");
    set req.http.Cookie = regsuball(req.http.Cookie, "__utm.=[^;]+(; )?", "");
    set req.http.Cookie = regsuball(req.http.Cookie, "_ga=[^;]+(; )?", "");
    set req.http.Cookie = regsuball(req.http.Cookie, "utmctr=[^;]+(; )?", "");
    set req.http.Cookie = regsuball(req.http.Cookie, "utmcmd.=[^;]+(; )?", "");
    set req.http.Cookie = regsuball(req.http.Cookie, "utmccn.=[^;]+(; )?", "");

    # Remove proxy header (see https://httpoxy.org/#mitigate-varnish)
    unset req.http.proxy;

    # Normalize query arguments (sort alphabetically)
    set req.url = std.querysort(req.url);

    # Strip trailing ? if it exists
    if (req.url ~ "\?$") {
        set req.url = regsub(req.url, "\?$", "");
    }

    # Limit requests to the following types.
    # The alternation is grouped so that ^ and $ anchor every method name;
    # without the parentheses ^ applied only to GET and $ only to DELETE, so
    # any method merely containing e.g. "POST" was accepted.
    if (req.method !~ "^(GET|HEAD|PUT|POST|TRACE|OPTIONS|PATCH|DELETE)$") {
        return (pipe);
    }

    # Only cache GET or HEAD requests to ensure that POST requests are always passed through, along with their cookies
    if (req.method != "GET" && req.method != "HEAD") {
        return (pass);
    }

    # Don't cache AJAX requests
    if (req.http.X-Requested-With == "XMLHttpRequest") {
        return(pass);
    }

    # Don't cache images and PDFs
    if (req.url ~ "\.(gif|jpg|jpeg|bmp|png|pdf)$") {
        return(pass);
    }

    # Don't cache large files (zip, audio, video, etc.)
    if (req.url ~ "^[^?]*\.(7z|avi|bz2|flac|flv|gz|mka|mkv|mov|mp3|mp4|mpeg|mpg|ogg|ogm|opus|rar|tar|tgz|tbz|txz|wav|webm|wmv|xz|zip)(\?.*)?$") {
        return (pipe);
    }

    # Don't cache requests that carry an Authorization header
    if (req.http.Authorization) {
        return (pass);
    }

    # Wordpress: don't cache search results
    if (req.url ~ "/\?s=") {
        return (pass);
    }

    # Wordpress: don't cache REST API (hand-rolled APIs used by custom themes)
    if (req.url ~ "/shared-gc/includes/rest-api/") {
        return (pass);
    }

    # Wordpress: don't cache anything with a cache-breaking v=<random> parameter (see gc.loadCachedJSON() JS function)
    if (req.url ~ "(\?|&)v=0") {
        return (pass);
    }

    # Don't cache the special pages we use to generate PDFs from the Wordpress catalog site
    if (req.url ~ "/generate-catalog/") {
        return (pass);
    }

    # Respect the browser's desire for a fresh copy on hard refresh. This ban will only work if there are no further URL changes (e.g. set req.url = ...) after it
    if (req.http.Cache-Control == "no-cache") {
        ban("req.http.host == " + req.http.host + " && req.url == " + req.url);
    }

    # Are there cookies left with only spaces or that are empty?
    if (req.http.cookie ~ "^\s*$") {
        unset req.http.cookie;
    }

    # Remove all cookies to enable caching.
    # Every request that reaches this point loses its Cookie header, whatever
    # the selective clean-up above left in it.
    unset req.http.Cookie;
    return (hash);
}

# Build the cache key from a cleaned copy of the URL (kept in the request
# header "newUrl") plus the Host header, or the server IP when no Host is sent.
sub vcl_hash {
    set req.http.newUrl = req.url;

    # Marketing/tracking parameters must not produce separate cache entries
    if (req.http.newUrl ~ "(\?|&)(utm_source|utm_medium|utm_campaign|utm_content|gclid|fbclid|cx|ie|cof|siteurl|gc_source|mkt_tok)=") {
        # Drop the parameter when it follows another one ...
        set req.http.newUrl = regsuball(req.http.newUrl, "&(utm_source|utm_medium|utm_campaign|utm_content|gclid|fbclid|cx|ie|cof|siteurl|gc_source|mkt_tok)=([A-z0-9_\-\.%25]+)", "");
        # ... and when it is the first one, keeping the "?"
        set req.http.newUrl = regsuball(req.http.newUrl, "\?(utm_source|utm_medium|utm_campaign|utm_content|gclid|fbclid|cx|ie|cof|siteurl|gc_source|mkt_tok)=([A-z0-9_\-\.%25]+)", "?");
        # Tidy up a leftover "?&" or a dangling "?"
        set req.http.newUrl = regsub(req.http.newUrl, "\?&", "?");
        set req.http.newUrl = regsub(req.http.newUrl, "\?$", "");
    }

    # A fragment ("#...") never takes part in the cache key
    if (req.http.newUrl ~ "\#") {
        set req.http.newUrl = regsub(req.http.newUrl, "\#.*$", "");
    }

    # Same as the default vcl_hash, with req.http.newUrl in place of req.url
    hash_data(req.http.newUrl);
    if (req.http.host) {
        hash_data(req.http.host);
    } else {
        hash_data(server.ip);
    }
    return (lookup);
}

# Decide how long backend responses are stored and tag them for regex bans.
sub vcl_backend_response {
    # Record URL and host on the object; the regex ban in vcl_recv matches on
    # these headers, and vcl_deliver strips them again before delivery
    set beresp.http.x-url = bereq.url;
    set beresp.http.x-host = bereq.http.host;

    # Lifetime: thirty days of TTL, twelve hours of grace, twenty-four hours of keep
    set beresp.ttl = 30d;
    set beresp.grace = 12h;
    set beresp.keep = 24h;

# Set different TTLs for other hosts
#   if (bereq.url ~ "(example.com|secondexample.com)") {
#       set beresp.ttl = 30d;
#}

    # 301 and 302: strip any port number from Location and don't cache the redirect
    if (beresp.status == 301 || beresp.status == 302) {
        set beresp.http.Location = regsub(beresp.http.Location, ":[0-9]+", "");
        set beresp.uncacheable = true;
    }

    # Cache 403 and 404 responses for five minutes (can be cleared by hard refresh)
    if (beresp.status == 403 || beresp.status == 404) {
        set beresp.ttl = 5m;
    }

    # A background fetch that comes back with a "5XX" error is abandoned
    if (bereq.is_bgfetch && beresp.status >= 500 && beresp.status <= 599) {
        return (abandon);
    }
}

# Final touches on the response before it goes to the client.
sub vcl_deliver {
    # Internal bookkeeping headers set in vcl_backend_response
    unset resp.http.x-url;
    unset resp.http.x-host;

    # Debug headers: HIT/MISS plus the hit counter; disable when not needed
    if (obj.hits > 0) {
        set resp.http.X-Cache = "HIT";
    } else {
        set resp.http.X-Cache = "MISS";
    }
    set resp.http.X-Cache-Hits = obj.hits;

    # Remove headers to improve security
    unset resp.http.X-Varnish;
    unset resp.http.Via;
    unset resp.http.X-Powered-By;
    unset resp.http.Server;
}

# Nothing to set up at load time: no directors or other objects are created here.
sub vcl_init {
    return (ok);
}

# Nothing to clean up when the VCL is discarded.
sub vcl_fini {
    return (ok);
}

# Decide what to do with an object found in cache, depending on how stale it
# is and on whether the selected backend is healthy.
sub vcl_hit {
    # Still within its TTL: deliver straight from cache
    if (obj.ttl >= 0s) {
        return (deliver);
    }

    # Backend is down: serve the stale object for as long as grace allows
    if (!std.healthy(req.backend_hint)) {
        if (obj.ttl + obj.grace > 0s) {
            set req.http.grace = "full";
            return (deliver);
        }
        # No object within grace: go to the backend anyway
        return (miss);
    }

    # Backend is healthy: tolerate at most 10 seconds of staleness
    if (obj.ttl + 10s > 0s) {
        set req.http.grace = "normal(limited)";
        return (deliver);
    }

    # Too stale for a healthy backend: fetch a fresh object
    return(miss);
}

This doesn't seem to work, but I can't see why. If a backend server goes down, the Varnish server quickly returns a 503 or other error. Any pointers as to what I've got wrong would be appreciated.

Relatedly, I really like the ability to clear the cache with a hard browser refresh. But it strikes me that it would be great to be able to negate that rule if the backend is down. Any idea how I go about that?

And, of course, if there are any obvious errors in here, I'd love to hear about them.

Thanks!

Thijs Feryn · Accepted Answer · 2021-10-19T08:03:48

Apparently you have some logic in vcl_hit that inspects the backend health and interferes with the TTL and grace value.

However, there's a simpler way of handling what is called Stale If Error. Here's the VCL code you need:

vcl 4.1;

import std;

# Single origin server with a health probe; std.healthy() in vcl_recv relies
# on the result of this probe.
backend default {
    .host = "xx.xx.xx.xx";
    .port = "80";
    .max_connections = 100;
    .probe = {
        # Poll /varnish-check.txt every 5 seconds, waiting at most 1 second
        .url = "/varnish-check.txt";
        .timeout = 1s;
        .interval = 5s;
        .window = 5;
        .threshold = 3;
    }
}

# While the backend is healthy, limit the grace accepted for this request to
# 10 seconds. When it is not healthy, req.grace is left untouched.
sub vcl_recv {
    if (std.healthy(req.backend_hint)) {
        set req.grace = 10s;
    }
}

# Store every object with 24 hours of grace.
sub vcl_backend_response {
    set beresp.grace = 24h;
}

Total object lifetime

To understand the logic behind the VCL code, you need to understand how Varnish determines the object lifetime.

The total object lifetime is the sum of the following elements:

Total object lifetime = TTL + grace + keep

As long as the sum of these elements is greater than zero, the object is kept around in cache. That doesn't mean revalidation doesn't happen.

If the remaining TTL drops below zero, Varnish attempts to connect to the origin server. But if there's grace left, it will do this asynchronously while it's serving the stale content to the client.

If the backend is down, the stale content is still served.

If the object has expired and is out of grace, synchronous revalidation happens. If there is some keep time left, the potential ETag and Last-Modified headers of the expired object are used and converted into If-None-Match and If-Modified-Since backend request headers.

Conditional requests are the only real feature of keep time, because synchronous revalidation puts client requests in the queue, unlike grace mode.

How to leverage grace mode for stale if error

Grace mode is Varnish's implementation of stale while revalidate and can be set in VCL using beresp.grace but also via Cache-Control: stale-while-revalidate=3600.

What we're trying to do here is take advantage of grace mode's capability to send stale content to the client while revalidating asynchronously.

As the VCL example shows we're saving the object with a grace value of 24 hours. However, when requesting the object, we're only using 10 seconds of grace as long as the backend is healthy.

If the backend turns out to be unhealthy, req.grace is not lowered to 10 seconds, so the object's original grace of 24 hours is used.

Using grace for stale if error works, but it's a bit of a hack.

See https://www.youtube.com/watch?v=51WUTB1cUeM for a 2-minute video about grace mode.

Using vmod_stale

Varnish also has a proper implementation of stale if error, but it's only available in Varnish Enterprise.

We specifically built vmod_stale to handle situations where backends are down.

Here's some example VCL code that uses vmod_stale:

vcl 4.1;

import stale;

# Single origin server with a health probe.
backend default {
    .host = "xx.xx.xx.xx";
    .port = "80";
    .max_connections = 100;
    .probe = {
        # Poll /varnish-check.txt every 5 seconds, waiting at most 1 second
        .url = "/varnish-check.txt";
        .timeout = 1s;
        .interval = 5s;
        .window = 5;
        .threshold = 3;
    }
}

# Shared handler, called from vcl_backend_response and vcl_backend_error:
# when the fetch ended in a 5xx status and a stale copy of the object exists,
# revive that copy and abandon the failed fetch.
sub stale_if_error {
    if (beresp.status >= 500 && stale.exists()) {
        # Give the stale object new TTL and grace values (presumably 20 minutes
        # of TTL and 1 hour of grace; confirm the argument order in the
        # vmod_stale reference)
        stale.revive(20m, 1h);
        stale.deliver();
        return (abandon);
    }
}

# The backend answered: run the stale-if-error check on its response.
sub vcl_backend_response {

    call stale_if_error;
}

# The backend fetch failed: run the same stale-if-error check.
sub vcl_backend_error {
    call stale_if_error;
}

This VCL example leverages stale.revive() to set new values for TTL and grace while respecting the original total expiration time.

If the new TTL and grace time in combination with the existing keep time exceed the total life time of the object, the overflow is deducted from the keep time.

See https://www.youtube.com/watch?v=6LY4Idt1e2Q for a video about this VMOD.

Varnish doesn't keep serving cache when the backends are down

1 Answer

Total object lifetime

How to leverage grace mode for stale if error

Using vmod_stale