From 08028714b57540a46209b6ab094c0a9cc103830f Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Tue, 13 Feb 2018 13:23:09 -0700 Subject: [PATCH] tls: Synchronize renewals between Caddy instances sharing file storage Also introduce caddy.OnProcessExit which is a list of functions that run before exiting the process cleanly; these do not count as shutdown callbacks, so they do not return errors and must execute quickly. --- caddy.go | 8 +++ caddytls/certificates.go | 76 ++++++++++++++-------- caddytls/client.go | 54 +++++----------- caddytls/filestorage.go | 7 +- caddytls/filestoragesync.go | 123 ++++++++++++++++++++++++++++++++++++ caddytls/maintain.go | 25 ++------ caddytls/storage.go | 4 ++ caddytls/sync_locker.go | 57 ----------------- plugins.go | 8 +++ sigtrap.go | 9 +-- sigtrap_posix.go | 8 +-- 11 files changed, 227 insertions(+), 152 deletions(-) create mode 100644 caddytls/filestoragesync.go delete mode 100644 caddytls/sync_locker.go diff --git a/caddy.go b/caddy.go index dd2d473a..628673a7 100644 --- a/caddy.go +++ b/caddy.go @@ -77,6 +77,14 @@ var ( mu sync.Mutex ) +func init() { + OnProcessExit = append(OnProcessExit, func() { + if PidFile != "" { + os.Remove(PidFile) + } + }) +} + // Instance contains the state of servers created as a result of // calling Start and can be used to access or control those servers. // It is literally an instance of a server type. Instance values diff --git a/caddytls/certificates.go b/caddytls/certificates.go index 2df576ff..29c0c8c2 100644 --- a/caddytls/certificates.go +++ b/caddytls/certificates.go @@ -29,34 +29,6 @@ import ( "golang.org/x/crypto/ocsp" ) -// Certificate is a tls.Certificate with associated metadata tacked on. -// Even if the metadata can be obtained by parsing the certificate, -// we are more efficient by extracting the metadata onto this struct. -type Certificate struct { - tls.Certificate - - // Names is the list of names this certificate is written for. - // The first is the CommonName (if any), the rest are SAN. - Names []string - - // NotAfter is when the certificate expires. - NotAfter time.Time - - // OCSP contains the certificate's parsed OCSP response. - OCSP *ocsp.Response - - // The hex-encoded hash of this cert's chain's bytes. - Hash string - - // configs is the list of configs that use or refer to - // The first one is assumed to be the config that is - // "in charge" of this certificate (i.e. determines - // whether it is managed, how it is managed, etc). - // This field will be populated by cacheCertificate. - // Only meddle with it if you know what you're doing! - configs []*Config -} - // certificateCache is to be an instance-wide cache of certs // that site-specific TLS configs can refer to. Using a // central map like this avoids duplication of certs in @@ -127,6 +99,54 @@ func (certCache *certificateCache) replaceCertificate(oldCert, newCert Certifica return nil } +// reloadManagedCertificate reloads the certificate corresponding to the name(s) +// on oldCert into the cache, from storage. This also replaces the old certificate +// with the new one, so that all configurations that used the old cert now point +// to the new cert. +func (certCache *certificateCache) reloadManagedCertificate(oldCert Certificate) error { + // get the certificate from storage and cache it + newCert, err := oldCert.configs[0].CacheManagedCertificate(oldCert.Names[0]) + if err != nil { + return fmt.Errorf("unable to reload certificate for %v into cache: %v", oldCert.Names, err) + } + + // and replace the old certificate with the new one + err = certCache.replaceCertificate(oldCert, newCert) + if err != nil { + return fmt.Errorf("replacing certificate %v: %v", oldCert.Names, err) + } + + return nil +} + +// Certificate is a tls.Certificate with associated metadata tacked on. +// Even if the metadata can be obtained by parsing the certificate, +// we are more efficient by extracting the metadata onto this struct. +type Certificate struct { + tls.Certificate + + // Names is the list of names this certificate is written for. + // The first is the CommonName (if any), the rest are SAN. + Names []string + + // NotAfter is when the certificate expires. + NotAfter time.Time + + // OCSP contains the certificate's parsed OCSP response. + OCSP *ocsp.Response + + // The hex-encoded hash of this cert's chain's bytes. + Hash string + + // configs is the list of configs that use or refer to + // The first one is assumed to be the config that is + // "in charge" of this certificate (i.e. determines + // whether it is managed, how it is managed, etc). + // This field will be populated by cacheCertificate. + // Only meddle with it if you know what you're doing! + configs []*Config +} + // CacheManagedCertificate loads the certificate for domain into the // cache, from the TLS storage for managed certificates. It returns a // copy of the Certificate that was put into the cache. diff --git a/caddytls/client.go b/caddytls/client.go index 4775a2d1..cdd715b9 100644 --- a/caddytls/client.go +++ b/caddytls/client.go @@ -39,7 +39,7 @@ type ACMEClient struct { AllowPrompts bool config *Config acmeClient *acme.Client - locker Locker + storage Storage } // newACMEClient creates a new ACMEClient given an email and whether @@ -121,10 +121,7 @@ var newACMEClient = func(config *Config, allowPrompts bool) (*ACMEClient, error) AllowPrompts: allowPrompts, config: config, acmeClient: client, - locker: &syncLock{ - nameLocks: make(map[string]*sync.WaitGroup), - nameLocksMu: sync.Mutex{}, - }, + storage: storage, } if config.DNSProvider == "" { @@ -209,13 +206,7 @@ var newACMEClient = func(config *Config, allowPrompts bool) (*ACMEClient, error) // Callers who have access to a Config value should use the ObtainCert // method on that instead of this lower-level method. func (c *ACMEClient) Obtain(name string) error { - // Get access to ACME storage - storage, err := c.config.StorageFor(c.config.CAUrl) - if err != nil { - return err - } - - waiter, err := c.locker.TryLock(name) + waiter, err := c.storage.TryLock(name) if err != nil { return err } @@ -225,7 +216,7 @@ func (c *ACMEClient) Obtain(name string) error { return nil // we assume the process with the lock succeeded, rather than hammering this execution path again } defer func() { - if err := c.locker.Unlock(name); err != nil { + if err := c.storage.Unlock(name); err != nil { log.Printf("[ERROR] Unable to unlock obtain call for %s: %v", name, err) } }() @@ -268,7 +259,7 @@ Attempts: } // Success - immediately save the certificate resource - err = saveCertResource(storage, certificate) + err = saveCertResource(c.storage, certificate) if err != nil { return fmt.Errorf("error saving assets for %v: %v", name, err) } @@ -279,35 +270,30 @@ Attempts: return nil } -// Renew renews the managed certificate for name. This function is -// safe for concurrent use. +// Renew renews the managed certificate for name. It puts the renewed +// certificate into storage (not the cache). This function is safe for +// concurrent use. // // Callers who have access to a Config value should use the RenewCert // method on that instead of this lower-level method. func (c *ACMEClient) Renew(name string) error { - // Get access to ACME storage - storage, err := c.config.StorageFor(c.config.CAUrl) - if err != nil { - return err - } - - waiter, err := c.locker.TryLock(name) + waiter, err := c.storage.TryLock(name) if err != nil { return err } if waiter != nil { log.Printf("[INFO] Certificate for %s is already being renewed elsewhere and stored; waiting", name) waiter.Wait() - return nil // we assume the process with the lock succeeded, rather than hammering this execution path again + return nil // assume that the worker that renewed the cert succeeded; avoid hammering this path over and over } defer func() { - if err := c.locker.Unlock(name); err != nil { + if err := c.storage.Unlock(name); err != nil { log.Printf("[ERROR] Unable to unlock renew call for %s: %v", name, err) } }() // Prepare for renewal (load PEM cert, key, and meta) - siteData, err := storage.LoadSite(name) + siteData, err := c.storage.LoadSite(name) if err != nil { return err } @@ -350,21 +336,15 @@ func (c *ACMEClient) Renew(name string) error { return errors.New("too many renewal attempts; last error: " + err.Error()) } - // Executes Cert renew events caddy.EmitEvent(caddy.CertRenewEvent, name) - return saveCertResource(storage, newCertMeta) + return saveCertResource(c.storage, newCertMeta) } -// Revoke revokes the certificate for name and deltes +// Revoke revokes the certificate for name and deletes // it from storage. func (c *ACMEClient) Revoke(name string) error { - storage, err := c.config.StorageFor(c.config.CAUrl) - if err != nil { - return err - } - - siteExists, err := storage.SiteExists(name) + siteExists, err := c.storage.SiteExists(name) if err != nil { return err } @@ -373,7 +353,7 @@ func (c *ACMEClient) Revoke(name string) error { return errors.New("no certificate and key for " + name) } - siteData, err := storage.LoadSite(name) + siteData, err := c.storage.LoadSite(name) if err != nil { return err } @@ -383,7 +363,7 @@ func (c *ACMEClient) Revoke(name string) error { return err } - err = storage.DeleteSite(name) + err = c.storage.DeleteSite(name) if err != nil { return errors.New("certificate revoked, but unable to delete certificate file: " + err.Error()) } diff --git a/caddytls/filestorage.go b/caddytls/filestorage.go index 67084ef4..9dd2d849 100644 --- a/caddytls/filestorage.go +++ b/caddytls/filestorage.go @@ -38,9 +38,9 @@ var storageBasePath = filepath.Join(caddy.AssetsPath(), "acme") // Storage instance backed by the local disk. The resulting Storage // instance is guaranteed to be non-nil if there is no error. func NewFileStorage(caURL *url.URL) (Storage, error) { - return &FileStorage{ - Path: filepath.Join(storageBasePath, caURL.Host), - }, nil + storage := &FileStorage{Path: filepath.Join(storageBasePath, caURL.Host)} + storage.Locker = &fileStorageLock{caURL: caURL.Host, storage: storage} + return storage, nil } // FileStorage facilitates forming file paths derived from a root @@ -48,6 +48,7 @@ func NewFileStorage(caURL *url.URL) (Storage, error) { // cross-platform way or persisting ACME assets on the file system. type FileStorage struct { Path string + Locker } // sites gets the directory that stores site certificate and keys. diff --git a/caddytls/filestoragesync.go b/caddytls/filestoragesync.go new file mode 100644 index 00000000..4c81ca02 --- /dev/null +++ b/caddytls/filestoragesync.go @@ -0,0 +1,123 @@ +// Copyright 2015 Light Code Labs, LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package caddytls + +import ( + "fmt" + "os" + "sync" + "time" + + "github.com/mholt/caddy" +) + +func init() { + // be sure to remove lock files when exiting the process! + caddy.OnProcessExit = append(caddy.OnProcessExit, func() { + fileStorageNameLocksMu.Lock() + defer fileStorageNameLocksMu.Unlock() + for key, fw := range fileStorageNameLocks { + os.Remove(fw.filename) + delete(fileStorageNameLocks, key) + } + }) +} + +// fileStorageLock facilitates ACME-related locking by using +// the associated FileStorage, so multiple processes can coordinate +// renewals on the certificates on a shared file system. +type fileStorageLock struct { + caURL string + storage *FileStorage +} + +// TryLock attempts to get a lock for name, otherwise it returns +// a Waiter value to wait until the other process is finished. +func (s *fileStorageLock) TryLock(name string) (Waiter, error) { + fileStorageNameLocksMu.Lock() + defer fileStorageNameLocksMu.Unlock() + + // see if lock already exists within this process + fw, ok := fileStorageNameLocks[s.caURL+name] + if ok { + // lock already created within process, let caller wait on it + return fw, nil + } + + // attempt to persist lock to disk by creating lock file + fw = &fileWaiter{ + filename: s.storage.siteCertFile(name) + ".lock", + wg: new(sync.WaitGroup), + } + lf, err := os.OpenFile(fw.filename, os.O_CREATE|os.O_EXCL, 0644) + if err != nil { + if os.IsExist(err) { + // another process has the lock; use it to wait + return fw, nil + } + // otherwise, this was some unexpected error + return nil, err + } + lf.Close() + + // looks like we get the lock + fw.wg.Add(1) + fileStorageNameLocks[s.caURL+name] = fw + + return nil, nil +} + +// Unlock unlocks name. +func (s *fileStorageLock) Unlock(name string) error { + fileStorageNameLocksMu.Lock() + defer fileStorageNameLocksMu.Unlock() + fw, ok := fileStorageNameLocks[s.caURL+name] + if !ok { + return fmt.Errorf("FileStorage: no lock to release for %s", name) + } + os.Remove(fw.filename) + fw.wg.Done() + delete(fileStorageNameLocks, s.caURL+name) + return nil +} + +// fileWaiter waits for a file to disappear; it polls +// the file system to check for the existence of a file. +// It also has a WaitGroup which will be faster than +// polling, for when locking need only happen within this +// process. +type fileWaiter struct { + filename string + wg *sync.WaitGroup +} + +// Wait waits until the lock is released. +func (fw *fileWaiter) Wait() { + start := time.Now() + fw.wg.Wait() + for time.Since(start) < 1*time.Hour { + _, err := os.Stat(fw.filename) + if os.IsNotExist(err) { + return + } + time.Sleep(1 * time.Second) + } +} + +var fileStorageNameLocks = make(map[string]*fileWaiter) // keyed by CA + name +var fileStorageNameLocksMu sync.Mutex + +var _ Locker = &fileStorageLock{} +var _ Waiter = &fileWaiter{} diff --git a/caddytls/maintain.go b/caddytls/maintain.go index 7ce6c5e2..5e867d4b 100644 --- a/caddytls/maintain.go +++ b/caddytls/maintain.go @@ -160,17 +160,12 @@ func RenewManagedCertificates(allowPrompts bool) (err error) { log.Printf("[INFO] Certificate for %v expires in %v, but is already renewed in storage; reloading stored certificate", oldCert.Names, timeLeft) - // get the certificate from storage and cache it - newCert, err := oldCert.configs[0].CacheManagedCertificate(oldCert.Names[0]) + err = certCache.reloadManagedCertificate(oldCert) if err != nil { - log.Printf("[ERROR] Unable to reload certificate for %v into cache: %v", oldCert.Names, err) - continue - } - - // and replace the old certificate with the new one - err = certCache.replaceCertificate(oldCert, newCert) - if err != nil { - log.Printf("[ERROR] Replacing certificate: %v", err) + if allowPrompts { + return err // operator is present, so report error immediately + } + log.Printf("[ERROR] Loading renewed certificate: %v", err) } } @@ -212,21 +207,13 @@ func RenewManagedCertificates(allowPrompts bool) (err error) { // successful renewal, so update in-memory cache by loading // renewed certificate so it will be used with handshakes - - // put the certificate in the cache - newCert, err := oldCert.configs[0].CacheManagedCertificate(renewName) + err = certCache.reloadManagedCertificate(oldCert) if err != nil { if allowPrompts { return err // operator is present, so report error immediately } log.Printf("[ERROR] %v", err) } - - // replace the old certificate with the new one - err = certCache.replaceCertificate(oldCert, newCert) - if err != nil { - log.Printf("[ERROR] Replacing certificate: %v", err) - } } // Deletion queue diff --git a/caddytls/storage.go b/caddytls/storage.go index 8587dd02..05606ed9 100644 --- a/caddytls/storage.go +++ b/caddytls/storage.go @@ -107,6 +107,10 @@ type Storage interface { // in StoreUser. The result is an empty string if there are no // persisted users in storage. MostRecentUserEmail() string + + // Locker is necessary because synchronizing certificate maintenance + // depends on how storage is implemented. + Locker } // ErrNotExist is returned by Storage implementations when diff --git a/caddytls/sync_locker.go b/caddytls/sync_locker.go deleted file mode 100644 index 693f3b87..00000000 --- a/caddytls/sync_locker.go +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2015 Light Code Labs, LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package caddytls - -import ( - "fmt" - "sync" -) - -var _ Locker = &syncLock{} - -type syncLock struct { - nameLocks map[string]*sync.WaitGroup - nameLocksMu sync.Mutex -} - -// TryLock attempts to get a lock for name, otherwise it returns -// a Waiter value to wait until the other process is finished. -func (s *syncLock) TryLock(name string) (Waiter, error) { - s.nameLocksMu.Lock() - defer s.nameLocksMu.Unlock() - wg, ok := s.nameLocks[name] - if ok { - // lock already obtained, let caller wait on it - return wg, nil - } - // caller gets lock - wg = new(sync.WaitGroup) - wg.Add(1) - s.nameLocks[name] = wg - return nil, nil -} - -// Unlock unlocks name. -func (s *syncLock) Unlock(name string) error { - s.nameLocksMu.Lock() - defer s.nameLocksMu.Unlock() - wg, ok := s.nameLocks[name] - if !ok { - return fmt.Errorf("FileStorage: no lock to release for %s", name) - } - wg.Done() - delete(s.nameLocks, name) - return nil -} diff --git a/plugins.go b/plugins.go index f7d14f86..ba111403 100644 --- a/plugins.go +++ b/plugins.go @@ -383,6 +383,14 @@ func loadCaddyfileInput(serverType string) (Input, error) { return caddyfileToUse, nil } +// OnProcessExit is a list of functions to run when the process +// exits -- they are ONLY for cleanup and should not block, +// return errors, or do anything fancy. They will be run with +// every signal, even if "shutdown callbacks" are not executed. +// This variable must only be modified in the main goroutine +// from init() functions. +var OnProcessExit []func() + // caddyfileLoader pairs the name of a loader to the loader. type caddyfileLoader struct { name string diff --git a/sigtrap.go b/sigtrap.go index a10cf0f0..ac61c59c 100644 --- a/sigtrap.go +++ b/sigtrap.go @@ -44,16 +44,17 @@ func trapSignalsCrossPlatform() { if i > 0 { log.Println("[INFO] SIGINT: Force quit") - if PidFile != "" { - os.Remove(PidFile) + for _, f := range OnProcessExit { + f() // important cleanup actions only } os.Exit(2) } log.Println("[INFO] SIGINT: Shutting down") - if PidFile != "" { - os.Remove(PidFile) + // important cleanup actions before shutdown callbacks + for _, f := range OnProcessExit { + f() } go func() { diff --git a/sigtrap_posix.go b/sigtrap_posix.go index 71b6969a..38aaa774 100644 --- a/sigtrap_posix.go +++ b/sigtrap_posix.go @@ -33,8 +33,8 @@ func trapSignalsPosix() { switch sig { case syscall.SIGTERM: log.Println("[INFO] SIGTERM: Terminating process") - if PidFile != "" { - os.Remove(PidFile) + for _, f := range OnProcessExit { + f() // only perform important cleanup actions } os.Exit(0) @@ -46,8 +46,8 @@ func trapSignalsPosix() { log.Printf("[ERROR] SIGQUIT stop: %v", err) exitCode = 3 } - if PidFile != "" { - os.Remove(PidFile) + for _, f := range OnProcessExit { + f() // only perform important cleanup actions } os.Exit(exitCode)