From e9ab9d3242cbdda2f39c44cc05cb8bc44704c769 Mon Sep 17 00:00:00 2001 From: gohai Date: Wed, 22 Jan 2020 13:41:45 -0800 Subject: [PATCH 1/5] Throttle the rate of renewal requests (ACME v2 rate limit) The ACME v2 API has a new rate limit of 300 new orders per account per 3 hours, which easily results in the "too many new orders recently" error for new or renewing domains. (see https://github.com/GUI/lua-resty-auto-ssl/issues/213) As a first step to mitigate this new requirement, throttle the rate with which renewal requests are issued to a default 60 per hour (which leaves 40 for on-demand issues per hour). --- README.md | 5 +++++ lib/resty/auto-ssl.lua | 4 ++++ lib/resty/auto-ssl/jobs/renewal.lua | 18 ++++++++++++++++++ spec/config/nginx.conf.etlua | 1 + 4 files changed, 28 insertions(+) diff --git a/README.md b/README.md index 07ae584..e7b5386 100644 --- a/README.md +++ b/README.md @@ -183,6 +183,11 @@ How frequently (in seconds) all of the domains should be checked for certificate auto_ssl:set("renew_check_interval", 172800) ``` +### `renewals_per_hour` +*Default:* `60` + +How many renewal requests to issue per hour at most. The ACME v2 protocol limits each account to 300 new orders per 3 hours. This setting will throttle the renewal job so that a sufficient margin remains available for new domains at all times. You might consider lowering this setting when the same Let's Encrypt account credentials are shared across multiple servers (in a load-balanced environment). + ### `storage_adapter` *Default:* `resty.auto-ssl.storage_adapters.file`
*Options:* `resty.auto-ssl.storage_adapters.file`, `resty.auto-ssl.storage_adapters.redis` diff --git a/lib/resty/auto-ssl.lua b/lib/resty/auto-ssl.lua index 4640bd7..a15b552 100644 --- a/lib/resty/auto-ssl.lua +++ b/lib/resty/auto-ssl.lua @@ -44,6 +44,10 @@ function _M.new(options) options["renew_check_interval"] = 86400 -- 1 day end + if not options["renewals_per_hour"] then + options["renewals_per_hour"] = 60 + end + if not options["hook_server_port"] then options["hook_server_port"] = 8999 end diff --git a/lib/resty/auto-ssl/jobs/renewal.lua b/lib/resty/auto-ssl/jobs/renewal.lua index 4a74a6e..d8c5aa3 100644 --- a/lib/resty/auto-ssl/jobs/renewal.lua +++ b/lib/resty/auto-ssl/jobs/renewal.lua @@ -5,6 +5,8 @@ local shuffle_table = require "resty.auto-ssl.utils.shuffle_table" local ssl_provider = require "resty.auto-ssl.ssl_providers.lets_encrypt" local _M = {} +local min_renewal_seconds +local last_renewal -- Based on lua-rest-upstream-healthcheck's lock: -- https://github.com/openresty/lua-resty-upstream-healthcheck/blob/v0.03/lib/resty/upstream/healthcheck.lua#L423-L440 @@ -184,6 +186,18 @@ local function renew_check_cert(auto_ssl_instance, storage, domain) end renew_check_cert_unlock(domain, storage, local_lock, distributed_lock_value) + + -- Throttle renewal requests based on renewals_per_hour setting. + if last_renewal and ngx.now() - last_renewal < min_renewal_seconds then + local to_sleep = min_renewal_seconds - (ngx.now() - last_renewal) + ngx.log(ngx.NOTICE, "auto-ssl: pausing renewal job for " .. to_sleep .. " seconds") + ngx.sleep(to_sleep) + end + if last_renewal then + last_renewal = last_renewal + min_renewal_seconds + else + last_renewal = ngx.now() + end end local function renew_all_domains(auto_ssl_instance) @@ -199,6 +213,10 @@ local function renew_all_domains(auto_ssl_instance) -- renewal attempts). shuffle_table(domains) + -- Set up renewal request throttling. 
+ min_renewal_seconds = 3600 / auto_ssl_instance:get("renewals_per_hour") + last_renewal = ngx.now() + for _, domain in ipairs(domains) do renew_check_cert(auto_ssl_instance, storage, domain) end diff --git a/spec/config/nginx.conf.etlua b/spec/config/nginx.conf.etlua index 56c45a3..a7b5119 100644 --- a/spec/config/nginx.conf.etlua +++ b/spec/config/nginx.conf.etlua @@ -33,6 +33,7 @@ http { allow_domain = function(domain) return true end, + renewals_per_hour = 3600, } <%- auto_ssl_pre_new or "" %> auto_ssl = (require "resty.auto-ssl").new(options) From bc2a99907cbdf252ed79f8b74fc35bb31266b410 Mon Sep 17 00:00:00 2001 From: gohai Date: Wed, 22 Jan 2020 14:55:50 -0800 Subject: [PATCH 2/5] Consider the time the renewal job took when rescheduling it With throttling, renewal jobs can take a significant time to complete (e.g. when a bunch of domains are renewing around the same time). To make sure we can spread out the renewal as evenly as possible throughout the period, and to ensure the renewal checks on different workers stay in sync, consider the duration of the actual job execution when re-scheduling the check. 
--- lib/resty/auto-ssl/jobs/renewal.lua | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/resty/auto-ssl/jobs/renewal.lua b/lib/resty/auto-ssl/jobs/renewal.lua index d8c5aa3..c980a26 100644 --- a/lib/resty/auto-ssl/jobs/renewal.lua +++ b/lib/resty/auto-ssl/jobs/renewal.lua @@ -254,12 +254,14 @@ end local function renew(premature, auto_ssl_instance) if premature then return end + local start = ngx.now() local renew_ok, renew_err = pcall(do_renew, auto_ssl_instance) if not renew_ok then ngx.log(ngx.ERR, "auto-ssl: failed to run do_renew cycle: ", renew_err) end - local timer_ok, timer_err = ngx.timer.at(auto_ssl_instance:get("renew_check_interval"), renew, auto_ssl_instance) + local delay = math.max(0, auto_ssl_instance:get("renew_check_interval") - (ngx.now() - start)) + local timer_ok, timer_err = ngx.timer.at(delay, renew, auto_ssl_instance) if not timer_ok then if timer_err ~= "process exiting" then ngx.log(ngx.ERR, "auto-ssl: failed to create timer: ", timer_err) From c89e4a7fee723c49dd820d9888bf964e216416dc Mon Sep 17 00:00:00 2001 From: gohai Date: Mon, 27 Jan 2020 15:12:04 -0800 Subject: [PATCH 3/5] Start renewal job on random timer offset (ACME v2 rate limit) This is to increase the chances that environments using multiple servers won't have all instances renewing at the same time, which might tie up all orders available to the account, and thus prevent certificates for new domains from being issued. 
--- lib/resty/auto-ssl/init_worker.lua | 3 ++- lib/resty/auto-ssl/jobs/renewal.lua | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/resty/auto-ssl/init_worker.lua b/lib/resty/auto-ssl/init_worker.lua index e2fce9b..d273721 100644 --- a/lib/resty/auto-ssl/init_worker.lua +++ b/lib/resty/auto-ssl/init_worker.lua @@ -2,6 +2,7 @@ local random_seed = require "resty.auto-ssl.utils.random_seed" local renewal_job = require "resty.auto-ssl.jobs.renewal" local shell_blocking = require "shell-games" local start_sockproc = require "resty.auto-ssl.utils.start_sockproc" +local timer_rand = math.random() return function(auto_ssl_instance) local base_dir = auto_ssl_instance:get("dir") @@ -37,5 +38,5 @@ return function(auto_ssl_instance) storage_adapter:setup_worker() end - renewal_job.spawn(auto_ssl_instance) + renewal_job.spawn(auto_ssl_instance, timer_rand) end diff --git a/lib/resty/auto-ssl/jobs/renewal.lua b/lib/resty/auto-ssl/jobs/renewal.lua index c980a26..13f661c 100644 --- a/lib/resty/auto-ssl/jobs/renewal.lua +++ b/lib/resty/auto-ssl/jobs/renewal.lua @@ -270,8 +270,8 @@ local function renew(premature, auto_ssl_instance) end end -function _M.spawn(auto_ssl_instance) - local ok, err = ngx.timer.at(auto_ssl_instance:get("renew_check_interval"), renew, auto_ssl_instance) +function _M.spawn(auto_ssl_instance, timer_rand) + local ok, err = ngx.timer.at(timer_rand * auto_ssl_instance:get("renew_check_interval"), renew, auto_ssl_instance) if not ok then ngx.log(ngx.ERR, "auto-ssl: failed to create timer: ", err) return From 6e1ea73b2ff2d52a89e0515017e6b5143be56460 Mon Sep 17 00:00:00 2001 From: gohai Date: Wed, 22 Jan 2020 17:30:35 -0800 Subject: [PATCH 4/5] Track Let's Encrypt certificate failures per-domain (ACME v2 rate limit) This implements the minimal functionality for considering a domain's past certificate failures inside allow_domain. 
We believe something like this to be necessary to be able to comply with the new ACME v2 limit of 300 new orders/account/3h. A different approach which might be worth considering is saving the statistical information to (permanent) storage, like we do with certificates. This patch instead only uses a shm-based dictionary, meaning that information might be stale in a multi-server setup. (Dictionary entries are created with an expiration date, and we're running the following patch to look for a certificate in storage before calling allow_domain, meaning that this shouldn't be a problem in practice: https://github.com/Cargo/lua-resty-auto-ssl/commit/b1f9715dd24a4ad4c1204d638fac10b2d0d23830) --- README.md | 55 ++++++++++++++++++++++++ lib/resty/auto-ssl.lua | 58 ++++++++++++++++++++++++++ lib/resty/auto-ssl/jobs/renewal.lua | 4 ++ lib/resty/auto-ssl/ssl_certificate.lua | 3 ++ 4 files changed, 120 insertions(+) diff --git a/README.md b/README.md index e7b5386..4acffda 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,61 @@ auto_ssl:set("allow_domain", function(domain, auto_ssl, ssl_options, renewal) end) ``` +#### `get_failures` + +The optional `get_failures` function accepts a domain name argument, and can be used to retrieve statistics about failed certificate requests concerning the domain. The function will return a table with fields `first` (timestamp of first failure encountered), `last` (timestamp of most recent failure encountered), `num` (number of failures). The function will instead return `nil` if no error has been encountered. + +Note: the statistics are only kept for as long as the nginx instance is running. There is no sharing across multiple servers (as in a load-balanced environment) implemented.
+ +To make use of the `get_failures` function, add the following to the `http` configuration block: + +```nginx + lua_shared_dict auto_ssl_failures 1m; +``` + +When this shm-based dictionary exists, `lua-resty-auto-ssl` will use it to update a record it keeps for the domain whenever a Let's Encrypt certificate request fails (for both new domains, as well as renewing ones). When a certificate request is successful, `lua-resty-auto-ssl` will delete the record it has for the domain, so that future invocations will return `nil`. + +The `get_failures` function can be used inside `allow_domain` to implement per-domain rate-limiting, and similar rule sets. + +*Example:* + +```lua +auto_ssl:set("allow_domain", function(domain, auto_ssl, ssl_options, renewal) + local failures = auto_ssl:get_failures(domain) + -- only attempt one certificate request per hour + if not failures or 3600 < ngx.now() - failures["last"] then + return true + else + return false + end +end) +``` + +#### `track_failure` + +The optional `track_failure` function accepts a domain name argument and records a failure for this domain. This can be used to avoid repeated lookups of a domain in `allow_domain`. + +*Example:* + +```lua +auto_ssl:set("allow_domain", function(domain, auto_ssl, ssl_options, renewal) + local failures = auto_ssl:get_failures(domain) + -- only attempt one lookup or certificate request per hour + if failures and ngx.now() - failures["last"] <= 3600 then + return false + end + + local allow + -- (external lookup to check domain, e.g. 
via http) + if not allow then + auto_ssl:track_failure(domain) + return false + else + return true + end +end) +``` + ### `dir` *Default:* `/etc/resty-auto-ssl` diff --git a/lib/resty/auto-ssl.lua b/lib/resty/auto-ssl.lua index a15b552..094f522 100644 --- a/lib/resty/auto-ssl.lua +++ b/lib/resty/auto-ssl.lua @@ -99,4 +99,62 @@ function _M.hook_server(self) server(self) end +function _M.get_failures(self, domain) + if not ngx.shared.auto_ssl_failures then + ngx.log(ngx.ERR, "auto-ssl: dict auto_ssl_failures could not be found. Please add it to your configuration: `lua_shared_dict auto_ssl_failures 1m;`") + return + end + + local string = ngx.shared.auto_ssl_failures:get("domain:" .. domain) + if string then + local failures, json_err = self.storage.json_adapter:decode(string) + if json_err then + ngx.log(ngx.ERR, json_err, domain) + end + if failures then + local mt = { + __concat = function(op1, op2) + return tostring(op1) .. tostring(op2) + end, + __tostring = function(f) + return "first: " .. f["first"] .. ", last: " .. f["last"] .. ", num: " .. f["num"] + end + } + setmetatable(failures, mt) + return failures + end + end +end + +function _M.track_failure(self, domain) + if not ngx.shared.auto_ssl_failures then + return + end + + local failures + local string = ngx.shared.auto_ssl_failures:get("domain:" .. domain) + if string then + failures = self.storage.json_adapter:decode(string) + end + if not failures then + failures = {} + failures["first"] = ngx.now() + failures["last"] = failures["first"] + failures["num"] = 1 + else + failures["last"] = ngx.now() + failures["num"] = failures["num"] + 1 + end + string = self.storage.json_adapter:encode(failures) + ngx.shared.auto_ssl_failures:set("domain:" .. domain, string, 2592000) +end + +function _M.track_success(_, domain) + if not ngx.shared.auto_ssl_failures then + return + end + + ngx.shared.auto_ssl_failures:delete("domain:" .. 
domain) +end + return _M diff --git a/lib/resty/auto-ssl/jobs/renewal.lua b/lib/resty/auto-ssl/jobs/renewal.lua index 13f661c..c0f5430 100644 --- a/lib/resty/auto-ssl/jobs/renewal.lua +++ b/lib/resty/auto-ssl/jobs/renewal.lua @@ -183,6 +183,10 @@ local function renew_check_cert(auto_ssl_instance, storage, domain) ngx.log(ngx.WARN, "auto-ssl: existing certificate is expired, deleting: ", domain) storage:delete_cert(domain) end + + auto_ssl_instance:track_failure(domain) + else + auto_ssl_instance:track_success(domain) end renew_check_cert_unlock(domain, storage, local_lock, distributed_lock_value) diff --git a/lib/resty/auto-ssl/ssl_certificate.lua b/lib/resty/auto-ssl/ssl_certificate.lua index ab71cfd..69f838e 100644 --- a/lib/resty/auto-ssl/ssl_certificate.lua +++ b/lib/resty/auto-ssl/ssl_certificate.lua @@ -95,6 +95,9 @@ local function issue_cert(auto_ssl_instance, storage, domain) cert, err = ssl_provider.issue_cert(auto_ssl_instance, domain) if err then ngx.log(ngx.ERR, "auto-ssl: issuing new certificate failed: ", err) + auto_ssl_instance:track_failure(domain) + else + auto_ssl_instance:track_success(domain) end issue_cert_unlock(domain, storage, local_lock, distributed_lock_value) From 1ad9b3b99e6251458fa1a1b54ad79b3cd7222458 Mon Sep 17 00:00:00 2001 From: gohai Date: Tue, 7 Apr 2020 14:17:31 -0700 Subject: [PATCH 5/5] Do renewals gradually (ACME v2 rate limit) Previously, renewals were attempted 30 days before domains expired, and then every day going forward - until either the renewal succeeded or the existing certificate expired. This poses an issue with the new ACME v2 rate limit ("too many new orders recently"), because large numbers of domains might be expiring (and thus renewing) at the same time - e.g. because they all got moved onto lua-resty-auto-ssl the same time initially. 
To spread the renewal dates out more evenly, this patch rolls the dice whether a domain should be renewed or not, with increasing odds it will be, between 30 and 15 days before the domain expires. Renewals for domains expiring in less than 15 days are always attempted. --- lib/resty/auto-ssl/jobs/renewal.lua | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/resty/auto-ssl/jobs/renewal.lua b/lib/resty/auto-ssl/jobs/renewal.lua index c0f5430..53112a3 100644 --- a/lib/resty/auto-ssl/jobs/renewal.lua +++ b/lib/resty/auto-ssl/jobs/renewal.lua @@ -127,12 +127,22 @@ local function renew_check_cert(auto_ssl_instance, storage, domain) end -- If expiry date is known, attempt renewal if it's within 30 days. + -- Between 30 and 15 days out, only attempt renewal of a subset of domains (with + -- increasing likelihood of renewal being attempted). if cert["expiry"] then local now = ngx.now() if now + (30 * 24 * 60 * 60) < cert["expiry"] then ngx.log(ngx.NOTICE, "auto-ssl: expiry date is more than 30 days out, skipping renewal: ", domain) renew_check_cert_unlock(domain, storage, local_lock, distributed_lock_value) return + elseif now + (15 * 24 * 60 * 60) < cert["expiry"] then + local rand_value = math.random(cert["expiry"] - (30 * 24 * 60 * 60), cert["expiry"] - (15 * 24 * 60 * 60)) + local rand_renewal_threshold = now + if rand_value > rand_renewal_threshold then + ngx.log(ngx.NOTICE, "auto-ssl: expiry date is more than 15 days out, randomly not picked for renewal: ", domain) + renew_check_cert_unlock(domain, storage, local_lock, distributed_lock_value) + return + end end end