From 9160789b421ae1d28c813c8ed3297cedd469722a Mon Sep 17 00:00:00 2001
From: Matthew Holt <mholt@users.noreply.github.com>
Date: Thu, 10 May 2018 08:57:25 -0600
Subject: [PATCH] telemetry: Make http_user_agent a normalized field

This way we store a short hash of the UA (32-bit FNV-1a, hex-encoded to
at most 8 characters) instead of the full string, exactly the same way
we store TLS ClientHello info.
---
 caddyhttp/httpserver/mitm.go   |  3 ++-
 caddyhttp/httpserver/server.go |  4 +++-
 caddytls/handshake.go          |  2 +-
 telemetry/collection.go        | 11 +++++++++++
 4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/caddyhttp/httpserver/mitm.go b/caddyhttp/httpserver/mitm.go
index 6744a924e..d2faf5f3f 100644
--- a/caddyhttp/httpserver/mitm.go
+++ b/caddyhttp/httpserver/mitm.go
@@ -65,9 +65,10 @@ func (h *tlsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	h.listener.helloInfosMu.RUnlock()
 
 	ua := r.Header.Get("User-Agent")
+	uaHash := telemetry.FastHash([]byte(ua))
 
 	// report this request's UA in connection with this ClientHello
-	go telemetry.AppendUnique("tls_client_hello_ua:"+caddytls.ClientHelloInfo(info).Key(), ua)
+	go telemetry.AppendUnique("tls_client_hello_ua:"+caddytls.ClientHelloInfo(info).Key(), uaHash)
 
 	var checked, mitm bool
 	if r.Header.Get("X-BlueCoat-Via") != "" || // Blue Coat (masks User-Agent header to generic values)
diff --git a/caddyhttp/httpserver/server.go b/caddyhttp/httpserver/server.go
index 45f1c639d..800f921de 100644
--- a/caddyhttp/httpserver/server.go
+++ b/caddyhttp/httpserver/server.go
@@ -354,7 +354,9 @@ func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	if len(ua) > 512 {
 		ua = ua[:512]
 	}
-	go telemetry.AppendUnique("http_user_agent", ua)
+	uaHash := telemetry.FastHash([]byte(ua)) // this is a normalized field
+	go telemetry.SetNested("http_user_agent", uaHash, ua)
+	go telemetry.AppendUnique("http_user_agent_count", uaHash)
 	go telemetry.Increment("http_request_count")
 
 	// copy the original, unchanged URL into the context
diff --git a/caddytls/handshake.go b/caddytls/handshake.go
index 8b2639845..7e2ccb95e 100644
--- a/caddytls/handshake.go
+++ b/caddytls/handshake.go
@@ -516,7 +516,7 @@ func (info ClientHelloInfo) Key() string {
 	if !info.CompressionMethodsUnknown {
 		compressionMethods = fmt.Sprintf("%x", info.CompressionMethods)
 	}
-	return fastHash([]byte(fmt.Sprintf("%x-%x-%s-%s-%x-%x",
+	return telemetry.FastHash([]byte(fmt.Sprintf("%x-%x-%s-%s-%x-%x",
 		info.Version, info.CipherSuites, extensions,
 		compressionMethods, info.Curves, info.Points)))
 }
diff --git a/telemetry/collection.go b/telemetry/collection.go
index a46e2caf1..07cf0dc9c 100644
--- a/telemetry/collection.go
+++ b/telemetry/collection.go
@@ -15,6 +15,8 @@
 package telemetry
 
 import (
+	"fmt"
+	"hash/fnv"
 	"log"
 	"strings"
 
@@ -276,6 +278,15 @@ func atomicAdd(key string, amount int) {
 	bufferMu.Unlock()
 }
 
+// FastHash hashes input with 32-bit FNV-1a (hash/fnv) and returns the
+// sum as lowercase hex without zero padding, so at most 8 characters.
+// It is fast but NOT collision-resistant; never use it for cryptography.
+func FastHash(input []byte) string {
+	h := fnv.New32a()
+	h.Write(input) // error ignored: hash.Hash documents Write as never failing
+	return fmt.Sprintf("%x", h.Sum32())
+}
+
 // isDisabled returns whether key is
 // a disabled metric key. ALL collection
 // functions should call this and not