2022-03-23 10:59:20 +01:00
;; This Source Code Form is subject to the terms of the Mozilla Public
;; License, v. 2.0. If a copy of the MPL was not distributed with this
;; file, You can obtain one at http://mozilla.org/MPL/2.0/.
2022-09-20 23:23:22 +02:00
;; Copyright (c) KALEIDOS INC
2022-03-23 10:59:20 +01:00
(ns app.tasks.file-gc
"A maintenance task that is responsible of: purge unused file media,
2022-03-30 00:11:43 +02:00
clean unused object thumbnails and remove old file thumbnails. The
2022-03-23 10:59:20 +01:00
file is eligible to be garbage collected after some period of
inactivity (the default threshold is 72h)."
[app.common.data :as d]
[app.common.logging :as l]
[app.common.pages.migrations :as pmg]
2022-09-23 10:20:20 +02:00
[app.common.types.file :as ctf]
2022-06-28 11:05:45 +02:00
[app.common.types.shape-tree :as ctt]
2022-08-11 16:59:57 +02:00
[app.config :as cf]
2022-03-23 10:59:20 +01:00
[app.db :as db]
2022-11-01 09:46:54 +01:00
[app.rpc.commands.files :as files]
2022-03-23 10:59:20 +01:00
[app.util.blob :as blob]
2022-11-01 09:46:54 +01:00
[app.util.pointer-map :as pmap]
2022-03-23 10:59:20 +01:00
[app.util.time :as dt]
2022-03-30 00:11:43 +02:00
[clojure.set :as set]
2022-03-23 10:59:20 +01:00
[clojure.spec.alpha :as s]
[integrant.core :as ig]))
;; Forward declarations: both private helpers are defined further down
;; in this namespace but are referenced by the task handler first.
(declare ^:private retrieve-candidates
         ^:private process-file)
2022-08-11 16:59:57 +02:00
;; Spec for the handler's :min-age option; the value has to conform to
;; the ::dt/duration spec.
(s/def ::min-age ::dt/duration)
2022-03-23 10:59:20 +01:00
;; Configuration spec for the ::handler key: both :pool and :min-age
;; are required (unqualified) keys.
(defmethod ig/pre-init-spec ::handler [_]
  (s/keys :req-un [::db/pool ::min-age]))
;; Fills in the default :min-age (cf/deletion-delay). Keys that are
;; present in the provided config with a nil value are dropped first,
;; so they never shadow the default.
(defmethod ig/prep-key ::handler
  [_ cfg]
  (let [defaults {:min-age cf/deletion-delay}]
    (merge defaults (d/without-nils cfg))))
2022-03-23 10:59:20 +01:00
;; Builds the task handler: a function that receives the task params
;; and processes every candidate file inside a single `db/with-atomic`
;; block. Returns a map with the number of processed files.
;;
;; Recognized params:
;;   :file-id   - forwarded to `retrieve-candidates` through the cfg
;;   :min-age   - when present, takes precedence over the configured one
;;   :rollback? - when truthy, the work done is rolled back at the end
(defmethod ig/init-key ::handler
  [_ {:keys [pool] :as cfg}]
  (fn [{:keys [file-id] :as params}]
    (db/with-atomic [conn pool]
      (let [min-age (or (:min-age params) (:min-age cfg))
            cfg     (assoc cfg :min-age min-age :conn conn :file-id file-id)]
        (loop [total 0
               files (retrieve-candidates cfg)]
          (if-let [file (first files)]
            ;; Both branches need an explicit `do`: each one evaluates
            ;; more than one form.
            (do
              (process-file cfg file)
              (recur (inc total)
                     (rest files)))
            (do
              (l/info :hint "task finished" :min-age (dt/format-duration min-age) :processed total)

              ;; Allow optional rollback passed by params
              (when (:rollback? params)
                (db/rollback! conn))

              {:processed total})))))))
;; Fetches one candidate file per call: a file not yet marked as
;; trimmed, older than the interval passed as first parameter and
;; strictly older than the cursor timestamp passed as second parameter.
;; The selected columns are the ones read by the consumers of the row
;; (id, data, revn, modified_at and features).
(def ^:private sql:retrieve-candidates-chunk
"select f.id,
f.data,
f.revn,
f.modified_at,
f.features
from file as f
where f.has_media_trimmed is false
and f.modified_at < now() - ?::interval
and f.modified_at < ?
order by f.modified_at desc
limit 1
for update skip locked")
(defn- retrieve-candidates
  "Returns the sequence of file rows to be processed, each one with its
  `:features` column decoded into a set.

  When `file-id` is a uuid, a warning is logged and the rows are fetched
  by id only (no age filter is applied on this path). Otherwise the
  candidates are fetched chunk by chunk with
  `sql:retrieve-candidates-chunk`, using the `:modified-at` of the last
  row of each chunk as the cursor for the next one and starting from the
  current instant."
  [{:keys [conn min-age file-id] :as cfg}]
  (if (uuid? file-id)
    ;; Two forms in this branch (log + query), hence the explicit `do`.
    (do
      (l/warn :hint "explicit file id passed on params" :file-id file-id)
      (->> (db/query conn :file {:id file-id})
           (map #(update % :features db/decode-pgarray #{}))))
    (let [interval  (db/interval min-age)
          get-chunk (fn [cursor]
                      (let [rows (db/exec! conn [sql:retrieve-candidates-chunk interval cursor])]
                        ;; [next-cursor chunk-values]
                        [(some->> rows peek :modified-at)
                         (map #(update % :features db/decode-pgarray #{}) rows)]))]
      (d/iteration get-chunk
                   :vf second
                   :kf first
                   :initk (dt/now)))))
2022-03-23 10:59:20 +01:00
2022-06-22 11:39:57 +02:00
(defn collect-used-media
  "Analyzes the file data and collects all references to external
  assets. Returns a set of ids.

  The ids come from three places: the `[:fill-image :id]` of `:path`
  objects, the `[:metadata :id]` of `:image` objects (looking at the
  objects of every page and every component) and the keys of the
  `:media` map of the file data."
  [data]
  (let [xform (comp
               (map :objects)
               (mapcat vals)
               (keep (fn [{:keys [type] :as obj}]
                       (case type
                         :path  (get-in obj [:fill-image :id])
                         :image (get-in obj [:metadata :id])
                         ;; Any other shape type holds no media
                         ;; reference; without this default `case`
                         ;; would throw on it.
                         nil))))
        pages (concat
               (vals (:pages-index data))
               (vals (:components data)))]
    (-> #{}
        (into xform pages)
        (into (keys (:media data))))))
(defn- clean-file-media!
  "Deletes every file-media-object row of the given file whose id is
  not part of the set returned by `collect-used-media` for `data`."
  [conn file-id data]
  (let [used-ids (collect-used-media data)
        rows     (db/query conn :file-media-object {:file-id file-id})]
    (doseq [{:keys [id media-id thumbnail-id]} rows
            :when (not (contains? used-ids id))]
      (l/debug :hint "delete file media object"
               :id id
               :media-id media-id
               :thumbnail-id thumbnail-id)

      ;; NOTE: removing the file-media-object row makes the database
      ;; mark the referenced storage objects as touched. The storage
      ;; objects can't be deleted directly from here because several
      ;; files may point to the same ones.
      (db/delete! conn :file-media-object {:id id}))))
(defn- clean-file-frame-thumbnails!
  "Deletes the file_object_thumbnail rows of the given file whose
  object-id does not correspond to any frame currently present in the
  pages of `data`. The expected object-id of a frame is the page id
  concatenated with the frame id."
  [conn file-id data]
  (let [stored (->> (db/query conn :file-object-thumbnail
                              {:file-id file-id}
                              {:columns [:object-id]})
                    (into #{} (map :object-id)))

        ;; Computes the thumbnail object-ids for the frames of a page.
        get-objects-ids
        (fn [{:keys [id objects]}]
          (->> (ctt/get-frames objects)
               (map #(str id (:id %)))))

        using  (into #{}
                     (mapcat get-objects-ids)
                     (vals (:pages-index data)))

        unused (set/difference stored using)]

    (when (seq unused)
      (let [sql (str "delete from file_object_thumbnail "
                     " where file_id=? and object_id=ANY(?)")
            res (db/exec-one! conn [sql file-id (db/create-array conn "text" unused)])]
        (l/debug :hint "delete file object thumbnails" :file-id file-id :total (:next.jdbc/update-count res))))))
2022-03-23 10:59:20 +01:00
(defn- clean-file-thumbnails!
  "Deletes the file_thumbnail rows of the given file whose revn is
  lower than `revn`. A debug entry is logged only when at least one row
  has been deleted."
  [conn file-id revn]
  (let [sql   (str "delete from file_thumbnail "
                   " where file_id=? and revn < ?")
        res   (db/exec-one! conn [sql file-id revn])
        total (:next.jdbc/update-count res)]
    (when-not (zero? total)
      (l/debug :hint "delete file thumbnails" :file-id file-id :total total))))
;; Fetches one file per call among the non-deleted files related to the
;; library passed as first parameter, restricted to those modified
;; strictly before the cursor timestamp passed as second parameter.
(def ^:private sql:retrieve-client-files
"select f.data, f.modified_at
from file as f
left join file_library_rel as fl on (fl.file_id = f.id)
where fl.library_file_id = ?
and f.modified_at < ?
and f.deleted_at is null
order by f.modified_at desc
limit 1")
(defn- retrieve-client-files
  "Searches all the files that use the given library.

  Returns a sequence of decoded file data. The rows are fetched with
  `sql:retrieve-client-files`, which reads a single row per query; the
  `:modified-at` of the last fetched row is used as the cursor for the
  next query, starting from the current instant."
  [conn library-id]
  (let [get-chunk (fn [cursor]
                    (let [rows (db/exec! conn [sql:retrieve-client-files library-id cursor])]
                      ;; [next-cursor chunk-values]
                      [(some-> rows peek :modified-at)
                       (map (comp blob/decode :data) rows)]))]
    (d/iteration get-chunk
                 :vf second
                 :kf first
                 :initk (dt/now))))
(defn- clean-deleted-components!
  "Performs the garbage collection of unreferenced deleted components.

  A deleted component is kept only when it is still used by the library
  itself or by any of the files returned by `retrieve-client-files`.
  When at least one component can be dropped, the library data is
  stored again with the reduced `:deleted-components` index."
  [conn library-id library-data]
  (let [find-used-components-file
        (fn [components file-data]
          ;; Find which of the components are used in the file.
          (into #{}
                (filter #(ctf/used-in? file-data library-id % :component))
                components))

        find-used-components
        (fn [components files-data]
          ;; Find what components are used in any of the files. On each
          ;; step the components already found are removed from the
          ;; pending set, so the loop stops as soon as nothing is left
          ;; to look for or there are no more files.
          (loop [files-data      files-data
                 components      components
                 used-components #{}]
            (let [file-data (first files-data)]
              (if (or (nil? file-data) (empty? components))
                used-components
                (let [used-components-file (find-used-components-file components file-data)]
                  (recur (rest files-data)
                         (into #{} (remove used-components-file) components)
                         (into used-components used-components-file)))))))

        deleted-components     (set (vals (:deleted-components library-data)))
        saved-components       (find-used-components deleted-components
                                                     (cons library-data
                                                           (retrieve-client-files conn library-id)))
        new-deleted-components (d/index-by :id (vec saved-components))

        total                  (- (count deleted-components)
                                  (count saved-components))]

    (when-not (zero? total)
      (l/debug :hint "clean deleted components" :total total)
      ;; The :data column is read with `blob/decode` elsewhere in this
      ;; namespace, so the updated data is encoded back before storing.
      (let [new-data (-> library-data
                         (assoc :deleted-components new-deleted-components)
                         (blob/encode))]
        (db/update! conn :file
                    {:data new-data}
                    {:id library-id})))))
2022-03-23 10:59:20 +01:00
2022-11-01 09:46:54 +01:00
;; Selects the ids of the file_data_fragment rows that belong to the
;; file passed as first parameter and whose id is not contained in the
;; uuid array passed as second parameter.
(def ^:private sql:get-unused-fragments
"SELECT id FROM file_data_fragment
WHERE file_id = ? AND id != ALL(?::uuid[])")
(defn- clean-data-fragments!
  "Deletes the file_data_fragment rows of the given file that are no
  longer referenced. The referenced ids are taken from the pointer-map
  values found at the top level of `data` and among the values of its
  `:pages-index`."
  [conn file-id data]
  (let [candidates  (concat (vals data)
                            (vals (:pages-index data)))
        pointer-ids (set (for [value candidates
                               :when (pmap/pointer-map? value)]
                           (pmap/get-id value)))
        used-array  (db/create-array conn "uuid" pointer-ids)
        unused-rows (db/exec! conn [sql:get-unused-fragments file-id used-array])]
    (doseq [{fragment-id :id} unused-rows]
      (l/trace :hint "remove unused file data fragment" :id (str fragment-id))
      (db/delete! conn :file-data-fragment {:id fragment-id :file-id file-id}))))
2022-03-23 10:59:20 +01:00
(defn- process-file
2022-11-01 09:46:54 +01:00
[{:keys [conn] :as cfg} {:keys [id data revn modified-at features] :as file}]
2022-03-23 10:59:20 +01:00
(l/debug :hint "processing file" :id id :modified-at modified-at)
2022-11-01 09:46:54 +01:00
(binding [pmap/*load-fn* (partial files/load-pointer conn id)]
(let [data (-> (blob/decode data)
(assoc :id id)
(clean-file-media! conn id data)
(clean-file-frame-thumbnails! conn id data)
(clean-file-thumbnails! conn id revn)
(clean-deleted-components! conn id data)
2022-03-23 10:59:20 +01:00
2022-11-01 09:46:54 +01:00
(when (contains? features "storage/pointer-map")
(clean-data-fragments! conn id data))
2022-03-23 10:59:20 +01:00
2022-11-01 09:46:54 +01:00
;; Mark file as trimmed
(db/update! conn :file
2022-03-23 10:59:20 +01:00
{:has-media-trimmed true}
{:id id})
2022-11-01 09:46:54 +01:00