0
Fork 0
mirror of https://github.com/TryGhost/Ghost.git synced 2025-04-08 02:52:39 -05:00

Cleaned up obsolete Tinybird pipes and datasources (#22724)

ref
https://linear.app/ghost/issue/ANAL-186/clean-up-pipes-and-data-sources-that-are-no-longer-relevant

- We currently have a lot of cruft in our Tinybird workspaces, with
obsolete versions of the current live APIs, and some completely unused
pipes and materialized views that have stuck around for one reason or
another.
- This commit adds a custom deploy script which cleans up all these
unused pipes and datasources, and also deletes the unused files from the
repo so we can iterate from a clean start going forward.
This commit is contained in:
Chris Raible 2025-03-31 13:37:04 -07:00 committed by GitHub
parent a2c3d9c570
commit 2dbc324e35
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 71 additions and 515 deletions

View file

@ -3,7 +3,7 @@
# bump post to deploy to the current live Release, rollback to previous post version is not available
# bump patch or minor to deploy a new Release and auto-promote it to live. Add TB_AUTO_PROMOTE=0 to create the Release in preview status
# bump major to deploy a new Release in preview status
VERSION=0.0.0
VERSION=0.0.1

View file

@ -1,20 +0,0 @@
VERSION 3
# Pre-aggregated page-hit data keyed by site, day, device, os, browser, location,
# source, pathname, post type and post UUID (see ENGINE_SORTING_KEY below).
# NOTE(review): presumably the target of a materializing pipe (the column list
# matches a `DATASOURCE _mv_pages` node) - confirm before relying on the name.
# `visits` and `pageviews` store aggregate-function states rather than plain
# numbers, so readers need the matching -Merge combinators (uniqMerge / countMerge).
# `member_status` keeps the maximum string value seen per key.
SCHEMA >
`site_uuid` String,
`post_uuid` String,
`post_type` String,
`date` Date,
`device` String,
`os` String,
`browser` String,
`location` String,
`source` String,
`pathname` String,
`member_status` SimpleAggregateFunction(max, String),
`visits` AggregateFunction(uniq, String),
`pageviews` AggregateFunction(count)
# Rows are partitioned by calendar month of `date`.
ENGINE "AggregatingMergeTree"
ENGINE_PARTITION_KEY "toYYYYMM(date)"
ENGINE_SORTING_KEY "site_uuid, date, device, os, browser, location, source, pathname, post_type, post_uuid"

View file

@ -1,16 +0,0 @@
VERSION 7
# Data Source created from Pipe 'mv_session_data__v7'
# Per-session summary rows: pageview count, first/last pageview timestamps,
# duration, bounce flag and source for each (site_uuid, session_id).
# NOTE(review): `duration` unit is not stated here - presumably seconds; confirm
# against the pipe that populates it. `is_bounce` is a UInt8, presumably 0/1.
SCHEMA >
`site_uuid` LowCardinality(String),
`session_id` String,
`pageviews` UInt64,
`first_pageview` DateTime,
`last_pageview` DateTime,
`duration` Int32,
`is_bounce` UInt8,
`source` String
# ReplacingMergeTree collapses rows that share the sorting key only when parts
# merge, so duplicate (site_uuid, session_id) rows can be visible to queries
# until then. Rows are partitioned by calendar month of `first_pageview`.
ENGINE "ReplacingMergeTree"
ENGINE_PARTITION_KEY "toYYYYMM(first_pageview)"
ENGINE_SORTING_KEY "site_uuid, session_id"

View file

@ -1,22 +0,0 @@
VERSION 3
# Pre-aggregated session data keyed by (site_uuid, date, session_id).
# NOTE(review): presumably the target of a materializing pipe (the column list
# matches a `DATASOURCE _mv_sessions` node) - confirm before relying on the name.
# Descriptive columns use SimpleAggregateFunction(any, ...), i.e. one arbitrary
# value per key is kept; `first_view` / `latest_view` keep the min / max
# timestamp; `pageviews` stores a count() aggregate state and must be read
# with countMerge.
SCHEMA >
`site_uuid` String,
`date` Date,
`session_id` String,
`member_status` SimpleAggregateFunction(max, String),
`post_uuid` SimpleAggregateFunction(any, String),
`post_type` SimpleAggregateFunction(any, String),
`device` SimpleAggregateFunction(any, String),
`os` SimpleAggregateFunction(any, String),
`browser` SimpleAggregateFunction(any, String),
`location` SimpleAggregateFunction(any, String),
`source` SimpleAggregateFunction(any, String),
`pathname` SimpleAggregateFunction(any, String),
`first_view` SimpleAggregateFunction(min, DateTime),
`latest_view` SimpleAggregateFunction(max, DateTime),
`pageviews` AggregateFunction(count)
# Rows are partitioned by calendar month of `date`.
ENGINE "AggregatingMergeTree"
ENGINE_PARTITION_KEY "toYYYYMM(date)"
ENGINE_SORTING_KEY "site_uuid, date, session_id"

View file

@ -1,21 +0,0 @@
VERSION 3
# Pre-aggregated traffic-source data keyed by site, day, session, device, os,
# browser, location, source and pathname (see ENGINE_SORTING_KEY below).
# NOTE(review): presumably the target of a materializing pipe (the column list
# matches a `DATASOURCE _mv_sources` node) - confirm before relying on the name.
# `visits` and `pageviews` store aggregate-function states and must be read with
# uniqMerge / countMerge. `post_type` and `post_uuid` are not part of the sorting
# key; one arbitrary value per key is kept for each.
SCHEMA >
`site_uuid` String,
`date` Date,
`session_id` String,
`device` String,
`os` String,
`browser` String,
`location` String,
`source` String,
`pathname` String,
`member_status` SimpleAggregateFunction(max, String),
`visits` AggregateFunction(uniq, String),
`pageviews` AggregateFunction(count),
`post_type` SimpleAggregateFunction(any, String),
`post_uuid` SimpleAggregateFunction(any, String)
# Rows are partitioned by calendar month of `date`.
ENGINE "AggregatingMergeTree"
ENGINE_PARTITION_KEY "toYYYYMM(date)"
ENGINE_SORTING_KEY "site_uuid, date, session_id, device, os, browser, location, source, pathname"

View file

@ -0,0 +1,70 @@
#!/bin/bash
#
# One-off Tinybird cleanup deploy script.
#
# Removes obsolete pipes and datasources from the currently selected Tinybird
# workspace/branch, prints what is left, and - only when the Tinybird branch is
# "main" - records the current git commit with `tb init --override-commit`.
#
# Requires: tb (already authenticated), jq, git.

# Exit on error. pipefail makes a failing `tb ... ls` or `jq` abort the script
# instead of silently handing an empty list to the delete loop.
set -e
set -o pipefail

# Returns success when the given command name resolves on PATH.
command_exists() {
    command -v "$1" >/dev/null 2>&1
}

# Reads resource ids from stdin (one per line, blanks ignored) and removes each
# one with `tb <kind> rm <id> --yes`.
#   $1 - resource kind understood by tb: "pipe" or "datasource"
remove_resources() {
    local kind="$1"
    local resource_id
    while read -r resource_id; do
        if [ -n "$resource_id" ]; then
            echo "Deleting $kind: $resource_id"
            # stdin is redirected so tb can never swallow the remaining ids.
            tb "$kind" rm "$resource_id" --yes </dev/null
        fi
    done
}

# Check required dependencies first, so a missing `tb` binary is reported as
# missing rather than as an authentication problem.
for cmd in tb jq git; do
    if ! command_exists "$cmd"; then
        echo "❌ Required command '$cmd' not found"
        exit 1
    fi
done

# Ensure we're authenticated
if ! tb workspace ls &>/dev/null; then
    echo "❌ Not authenticated with Tinybird. Please run 'tb auth' first"
    exit 1
fi

# Display current workspace and branch
echo "🔍 Current context:"
echo " Workspace: $(tb workspace ls | grep "True" | head -n1 | awk -F'|' '{print $2}' | xargs)"
echo " Current branch:"
tb branch current

# Delete all pipes that have a version less than 7 or no version, plus any pipe
# whose name ends in "_dup". Unversioned pipes are addressed by bare name,
# versioned ones as <name>__v<version>. A null/missing version is treated the
# same as an empty one so it never turns into "<name>__vnull".
echo "🗑️ Deleting pipes with version < 7 or no version..."
tb pipe ls --format json |
    jq -r '
        .pipes[]
        | ((.version == null) or (.version == "")) as $unversioned
        | select($unversioned or (.version < 7) or (.name | endswith("_dup")))
        | if $unversioned then .name else "\(.name)__v\(.version)" end
    ' |
    remove_resources pipe

# Delete all datasources except analytics_events and _mv_hits__v7. The id is
# built the same way as for pipes, so an unversioned datasource is removed by
# its bare name instead of an invalid "<name>__v" id.
echo "🗑️ Deleting non-essential datasources..."
tb datasource ls --format json |
    jq -r '
        .datasources[]
        | (if (.version == null) or (.version == "")
           then .name
           else "\(.name)__v\(.version)"
           end) as $id
        | select(.name != "analytics_events" and $id != "_mv_hits__v7")
        | $id
    ' |
    remove_resources datasource

echo "Current pipes after cleanup:"
tb pipe ls
echo "Current datasources after cleanup:"
tb datasource ls

echo "Updating the git commit metadata..."
# Only update git commit metadata if we're on the main branch.
# The branch name is read from the second '|'-separated column of the LAST
# table row printed by `tb branch current`, so a header row (if the CLI prints
# one) is not concatenated into the value. xargs trims surrounding whitespace.
CURRENT_BRANCH=$(tb branch current | awk -F'|' 'index($0, "|") { name = $2 } END { print name }' | xargs)
if [ "$CURRENT_BRANCH" = "main" ]; then
    CURRENT_COMMIT=$(git rev-parse HEAD)
    echo "On main branch, updating git commit metadata to: $CURRENT_COMMIT"
    tb init --override-commit "$CURRENT_COMMIT"
else
    echo "Not on main branch ($CURRENT_BRANCH), skipping git commit metadata update"
fi

# Reported last so it is only printed once every step above has succeeded.
echo "✅ Deployment completed successfully!"

View file

@ -1,8 +0,0 @@
# some comments
# tinybird was seeing this file as renamed...
# so we're just going to add a comment to the file
# to make it happy
# and then we'll delete the file
# and then we'll delete the comment
# and then we'll delete the file
# and then we'll delete the comment

View file

@ -1,61 +0,0 @@
# Endpoint pipe: traffic sources ranked by visits for one site, with optional
# date-range, member-status, device, browser, os, source, location and pathname
# filters. Dates are compared in the site timezone passed as `timezone`; when no
# range is given the last 7 days up to today are used.

# Session ids that have at least one hit in mv_hits matching every filter.
NODE filtered_sessions
DESCRIPTION >
    Get sessions that match the filter criteria

SQL >
    %
    select distinct session_id
    from mv_hits
    where
        site_uuid = {{ String(site_uuid, 'mock_site_uuid', description="Tenant ID", required=True) }}
        {% if defined(date_from) %} and toDate(toTimezone(timestamp, {{String(timezone, 'Etc/UTC', description="Site timezone", required=True)}})) >= {{ Date(date_from) }} {% else %} and toDate(toTimezone(timestamp, {{String(timezone, 'Etc/UTC', description="Site timezone", required=True)}})) >= timestampAdd(today(), interval -7 day) {% end %}
        {% if defined(date_to) %} and toDate(toTimezone(timestamp, {{String(timezone, 'Etc/UTC', description="Site timezone", required=True)}})) <= {{ Date(date_to) }} {% else %} and toDate(toTimezone(timestamp, {{String(timezone, 'Etc/UTC', description="Site timezone", required=True)}})) <= today() {% end %}
        {% if defined(member_status) %} and member_status IN {{ Array(member_status, "'undefined', 'free', 'paid'", description="Member status to filter on", required=False) }} {% end %}
        {% if defined(device) %} and device = {{ String(device, description="Device to filter on", required=False) }} {% end %}
        {% if defined(browser) %} and browser = {{ String(browser, description="Browser to filter on", required=False) }} {% end %}
        {% if defined(os) %} and os = {{ String(os, description="Operating system to filter on", required=False) }} {% end %}
        {% if defined(source) %} and source = {{ String(source, description="Source to filter on", required=False) }} {% end %}
        {% if defined(location) %} and location = {{ String(location, description="Location to filter on", required=False) }} {% end %}
        {% if defined(pathname) %} and pathname = {{ String(pathname, description="Pathname to filter on", required=False) }} {% end %}

# Narrows the filtered sessions to those whose session-level source matches the
# optional `source` filter. Columns are qualified with the `sd` alias because
# both join sides expose `session_id`.
NODE source_sessions
SQL >
    %
    select sd.session_id as session_id
    from _mv_session_data sd
    inner join filtered_sessions fs
        on fs.session_id = sd.session_id
    where
        sd.site_uuid = {{ String(site_uuid, 'mock_site_uuid', description="Tenant ID", required=True) }}
        {% if defined(source) %} and sd.source = {{ String(source, description="Source to filter on", required=False) }} {% end %}

# Final node: one row per source with its session count (visits) and total
# pageviews, ordered by visits and paginated with `skip` / `limit`.
# The single-day shortcut is only taken when BOTH date_from and date_to are
# supplied, so day_diff is never evaluated against a missing date_to.
NODE api_top_sources_2
SQL >
    %
    select
        sd.source as source,
        count() as visits,
        sum(sd.pageviews) as pageviews
    from _mv_session_data sd
    inner join source_sessions ss
        on ss.session_id = sd.session_id
    where
        sd.site_uuid = {{ String(site_uuid, 'mock_site_uuid', description="Tenant ID", required=True) }}
        {% if defined(date_from) and defined(date_to) and day_diff(date_from, date_to) == 0 %}
            and toDate(toTimezone(sd.first_pageview, {{String(timezone, 'Etc/UTC', description="Site timezone", required=True)}})) = {{ Date(date_from) }}
        {% else %}
            {% if defined(date_from) %} and toDate(toTimezone(sd.first_pageview, {{String(timezone, 'Etc/UTC', description="Site timezone", required=True)}})) >= {{ Date(date_from) }} {% else %} and toDate(toTimezone(sd.first_pageview, {{String(timezone, 'Etc/UTC', description="Site timezone", required=True)}})) >= timestampAdd(today(), interval -7 day) {% end %}
            {% if defined(date_to) %} and toDate(toTimezone(sd.first_pageview, {{String(timezone, 'Etc/UTC', description="Site timezone", required=True)}})) <= {{ Date(date_to) }} {% else %} and toDate(toTimezone(sd.first_pageview, {{String(timezone, 'Etc/UTC', description="Site timezone", required=True)}})) <= today() {% end %}
        {% end %}
    group by sd.source
    order by visits desc
    limit {{ Int32(skip, 0) }},{{ Int32(limit, 50) }}

View file

@ -1,117 +0,0 @@
VERSION 3
TOKEN "stats_page" READ

# Materializing pipe: analytics_events -> parsed_hits -> hits -> _pages_0,
# whose result is written to the _mv_pages datasource.

# Keeps only 'page_hit' events and casts the payload sub-fields to String.
# A missing session_id becomes '0'. The user agent is lower-cased so the
# patterns in the next node can be written in lowercase.
NODE parsed_hits
SQL >
    SELECT
        timestamp,
        action,
        version,
        coalesce(session_id, '0') as session_id,
        toString(payload.locale) as locale,
        toString(payload.location) as location,
        toString(payload.referrer) as referrer,
        toString(payload.pathname) as pathname,
        toString(payload.href) as href,
        site_uuid,
        toString(payload.member_uuid) as member_uuid,
        toString(payload.member_status) as member_status,
        toString(payload.post_uuid) as post_uuid,
        toString(payload.post_type) as post_type,
        lower(toString(getSubcolumn(payload,'user-agent'))) as user_agent
    FROM analytics_events
    where action = 'page_hit'

# Derives source (referrer domain without "www."), device, os and browser from
# the parsed hit. Each case expression returns the first matching branch, so
# branch order matters.
NODE hits
SQL >
    SELECT
        site_uuid,
        timestamp,
        action,
        version,
        session_id,
        member_uuid,
        member_status,
        post_uuid,
        post_type,
        location,
        domainWithoutWWW(referrer) as source,
        pathname,
        href,
        case
            when match(user_agent, 'wget|ahrefsbot|curl|urllib|bitdiscovery|\+https://|googlebot')
            then 'bot'
            when match(user_agent, 'android')
            then 'mobile-android'
            when match(user_agent, 'ipad|iphone|ipod')
            then 'mobile-ios'
            else 'desktop'
        END as device,
        -- Android agents also contain "linux" and iOS agents contain
        -- "like mac os x", so the mobile patterns are tested before the
        -- 'mac' and 'linux' patterns; otherwise they can never match.
        case
            when match(user_agent, 'windows')
            then 'windows'
            when match(user_agent, 'android')
            then 'android'
            when match(user_agent, 'iphone|ipad|ipod')
            then 'ios'
            when match(user_agent, 'mac')
            then 'macos'
            when match(user_agent, 'linux')
            then 'linux'
            else 'Unknown'
        END as os,
        case
            when match(user_agent, 'firefox')
            then 'firefox'
            when match(user_agent, 'chrome|crios')
            then 'chrome'
            when match(user_agent, 'opera')
            then 'opera'
            when match(user_agent, 'msie|trident')
            then 'ie'
            when match(user_agent, 'iphone|ipad|safari')
            then 'safari'
            else 'Unknown'
        END as browser
    FROM parsed_hits

# Aggregates hits per site, day and page dimensions. visits and pageviews are
# emitted as aggregate states (uniqState / countState) for the
# AggregateFunction columns of the target datasource.
NODE _pages_0
SQL >
    SELECT
        site_uuid,
        toDate(timestamp) AS date,
        post_type,
        post_uuid,
        device,
        os,
        browser,
        location,
        source,
        pathname,
        maxSimpleState(member_status) AS member_status,
        uniqState(session_id) AS visits,
        countState() AS pageviews
    FROM hits
    GROUP BY
        site_uuid,
        date,
        device,
        os,
        browser,
        location,
        source,
        pathname,
        post_type,
        post_uuid

TYPE materialized
DATASOURCE _mv_pages

View file

@ -1,112 +0,0 @@
VERSION 3
TOKEN "stats_page" READ

# Materializing pipe: analytics_events -> parsed_hits -> hits -> _sessions_0,
# whose result is written to the _mv_sessions datasource.

# Keeps only 'page_hit' events and casts the payload sub-fields to String.
# A missing session_id becomes '0'. The user agent is lower-cased so the
# patterns in the next node can be written in lowercase.
NODE parsed_hits
SQL >
    SELECT
        timestamp,
        action,
        version,
        coalesce(session_id, '0') as session_id,
        toString(payload.locale) as locale,
        toString(payload.location) as location,
        toString(payload.referrer) as referrer,
        toString(payload.pathname) as pathname,
        toString(payload.href) as href,
        site_uuid,
        toString(payload.member_uuid) as member_uuid,
        toString(payload.member_status) as member_status,
        toString(payload.post_uuid) as post_uuid,
        toString(payload.post_type) as post_type,
        lower(toString(getSubcolumn(payload,'user-agent'))) as user_agent
    FROM analytics_events
    where action = 'page_hit'

# Derives source (referrer domain without "www."), device, os and browser from
# the parsed hit. Each case expression returns the first matching branch, so
# branch order matters.
NODE hits
SQL >
    SELECT
        site_uuid,
        timestamp,
        action,
        version,
        session_id,
        member_uuid,
        member_status,
        post_uuid,
        post_type,
        location,
        domainWithoutWWW(referrer) as source,
        pathname,
        href,
        case
            when match(user_agent, 'wget|ahrefsbot|curl|urllib|bitdiscovery|\+https://|googlebot')
            then 'bot'
            when match(user_agent, 'android')
            then 'mobile-android'
            when match(user_agent, 'ipad|iphone|ipod')
            then 'mobile-ios'
            else 'desktop'
        END as device,
        -- Android agents also contain "linux" and iOS agents contain
        -- "like mac os x", so the mobile patterns are tested before the
        -- 'mac' and 'linux' patterns; otherwise they can never match.
        case
            when match(user_agent, 'windows')
            then 'windows'
            when match(user_agent, 'android')
            then 'android'
            when match(user_agent, 'iphone|ipad|ipod')
            then 'ios'
            when match(user_agent, 'mac')
            then 'macos'
            when match(user_agent, 'linux')
            then 'linux'
            else 'Unknown'
        END as os,
        case
            when match(user_agent, 'firefox')
            then 'firefox'
            when match(user_agent, 'chrome|crios')
            then 'chrome'
            when match(user_agent, 'opera')
            then 'opera'
            when match(user_agent, 'msie|trident')
            then 'ie'
            when match(user_agent, 'iphone|ipad|safari')
            then 'safari'
            else 'Unknown'
        END as browser
    FROM parsed_hits

# One row per (site_uuid, date, session_id). Descriptive columns keep one
# arbitrary value per session (anySimpleState), first_view / latest_view keep
# the min / max hit timestamp, and pageviews is emitted as a countState
# aggregate state for the target datasource.
NODE _sessions_0
SQL >
    SELECT
        site_uuid,
        toDate(timestamp) AS date,
        session_id,
        maxSimpleState(member_status) AS member_status,
        anySimpleState(post_uuid) AS post_uuid,
        anySimpleState(post_type) AS post_type,
        anySimpleState(device) AS device,
        anySimpleState(os) AS os,
        anySimpleState(browser) AS browser,
        anySimpleState(location) AS location,
        anySimpleState(source) AS source,
        anySimpleState(pathname) AS pathname,
        minSimpleState(timestamp) AS first_view,
        maxSimpleState(timestamp) AS latest_view,
        countState() AS pageviews
    FROM hits
    GROUP BY
        site_uuid,
        date,
        session_id

TYPE materialized
DATASOURCE _mv_sessions

View file

@ -1,137 +0,0 @@
VERSION 3
TOKEN "stats_page" READ

# Materializing pipe: analytics_events -> parsed_hits -> hits -> _sources_0,
# whose result is written to the _mv_sources datasource.

# Keeps only 'page_hit' events and casts the payload sub-fields to String.
# A missing session_id becomes '0'. The user agent is lower-cased so the
# patterns in the next node can be written in lowercase.
NODE parsed_hits
SQL >
    SELECT
        timestamp,
        action,
        version,
        coalesce(session_id, '0') as session_id,
        toString(payload.locale) as locale,
        toString(payload.location) as location,
        toString(payload.referrer) as referrer,
        toString(payload.pathname) as pathname,
        toString(payload.href) as href,
        site_uuid,
        toString(payload.member_uuid) as member_uuid,
        toString(payload.member_status) as member_status,
        toString(payload.post_uuid) as post_uuid,
        toString(payload.post_type) as post_type,
        lower(toString(getSubcolumn(payload,'user-agent'))) as user_agent
    FROM analytics_events
    where action = 'page_hit'

# Derives source (referrer domain without "www."), device, os and browser from
# the parsed hit. Each case expression returns the first matching branch, so
# branch order matters.
NODE hits
SQL >
    SELECT
        site_uuid,
        timestamp,
        action,
        version,
        session_id,
        member_uuid,
        member_status,
        post_uuid,
        post_type,
        location,
        domainWithoutWWW(referrer) as source,
        pathname,
        href,
        case
            when match(user_agent, 'wget|ahrefsbot|curl|urllib|bitdiscovery|\+https://|googlebot')
            then 'bot'
            when match(user_agent, 'android')
            then 'mobile-android'
            when match(user_agent, 'ipad|iphone|ipod')
            then 'mobile-ios'
            else 'desktop'
        END as device,
        -- Android agents also contain "linux" and iOS agents contain
        -- "like mac os x", so the mobile patterns are tested before the
        -- 'mac' and 'linux' patterns; otherwise they can never match.
        case
            when match(user_agent, 'windows')
            then 'windows'
            when match(user_agent, 'android')
            then 'android'
            when match(user_agent, 'iphone|ipad|ipod')
            then 'ios'
            when match(user_agent, 'mac')
            then 'macos'
            when match(user_agent, 'linux')
            then 'linux'
            else 'Unknown'
        END as os,
        case
            when match(user_agent, 'firefox')
            then 'firefox'
            when match(user_agent, 'chrome|crios')
            then 'chrome'
            when match(user_agent, 'opera')
            then 'opera'
            when match(user_agent, 'msie|trident')
            then 'ie'
            when match(user_agent, 'iphone|ipad|safari')
            then 'safari'
            else 'Unknown'
        END as browser
    FROM parsed_hits

# Attributes every hit of a session to the source of that session's earliest
# hit (argMin(source, timestamp)), then aggregates per site, day, session and
# page dimensions, dropping groups whose source equals current_domain.
# NOTE(review): current_domain is taken from a single hit with LIMIT 1 and no
# ORDER BY, i.e. an arbitrary row; if the processed hits span several sites the
# self-referral filter compares against one arbitrary domain - confirm intent.
# NOTE(review): the sessions CTE groups by session_id only, not by site_uuid -
# presumably session ids are unique across sites; verify.
NODE _sources_0
SQL >
    WITH
        (
            SELECT domainWithoutWWW(href)
            FROM hits
            LIMIT 1
        ) AS current_domain,
        sessions AS
        (
            SELECT
                session_id,
                argMin(source, timestamp) AS source,
                maxSimpleState(member_status) AS member_status
            FROM hits
            GROUP BY session_id
        )
    SELECT
        a.site_uuid,
        toDate(a.timestamp) AS date,
        a.session_id,
        a.device,
        a.os,
        a.browser,
        a.location,
        b.source AS source,
        a.pathname,
        a.post_type,
        a.post_uuid,
        b.member_status AS member_status,
        uniqState(a.session_id) AS visits,
        countState() AS pageviews
    FROM hits AS a
    INNER JOIN sessions AS b ON a.session_id = b.session_id
    GROUP BY
        a.site_uuid,
        toDate(a.timestamp),
        a.session_id,
        a.device,
        a.os,
        a.browser,
        a.location,
        b.member_status,
        b.source,
        a.pathname,
        a.post_type,
        a.post_uuid
    HAVING b.source != current_domain

TYPE materialized
DATASOURCE _mv_sources