diff --git a/infra/copycat-db/.github/workflows/ci.yaml b/infra/copycat-db/.github/workflows/ci.yaml
new file mode 100644
index 000000000..aa57e206c
--- /dev/null
+++ b/infra/copycat-db/.github/workflows/ci.yaml
@@ -0,0 +1,29 @@
+name: Build and push Docker image
+
+on:
+  # Enable manual run
+  workflow_dispatch:
+  push:
+    branches:
+      - release
+    # Sequence of patterns matched against refs/tags
+    tags:
+      - "v*" # Push events to matching v*, e.g. v4.2.0
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        name: Check out code
+
+      - uses: mr-smithers-excellent/docker-build-push@v6
+        name: Build & Push
+        with:
+          image: ente/copycat-db
+          registry: rg.fr-par.scw.cloud
+          tags: ${GITHUB_SHA}, latest
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+        env:
+          GIT_COMMIT: ${GITHUB_SHA}
diff --git a/infra/copycat-db/.gitignore b/infra/copycat-db/.gitignore
new file mode 100644
index 000000000..75236ad1c
--- /dev/null
+++ b/infra/copycat-db/.gitignore
@@ -0,0 +1,2 @@
+.DS_Store
+copycat-db.env
diff --git a/infra/copycat-db/Dockerfile b/infra/copycat-db/Dockerfile
new file mode 100644
index 000000000..4328ef17f
--- /dev/null
+++ b/infra/copycat-db/Dockerfile
@@ -0,0 +1,47 @@
+# Pinned to a fixed LTS release instead of the moving `latest` tag so that
+# rebuilds are reproducible.
+FROM ubuntu:24.04
+
+# Run build steps with bash and pipefail so that a failed download in a
+# `curl | bash` pipeline fails the build instead of being masked.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Base tooling. The package index is refreshed in the same layer that installs
+# the packages so that a stale cached `apt-get update` layer is never reused.
+RUN \
+    apt-get update && \
+    apt-get install -y curl gnupg jq lsb-release tini unzip && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install pg_dump (via Postgres client)
+# https://www.postgresql.org/download/linux/ubuntu/
+#
+# We don't need it for production backups, but this is useful for local testing.
+#
+# The repository key is saved as a dedicated keyring that the source entry
+# references with `signed-by`, instead of using the deprecated `apt-key add`.
+RUN \
+    curl -fsSL -o /usr/share/keyrings/pgdg.asc https://www.postgresql.org/media/keys/ACCC4CF8.asc && \
+    echo "deb [signed-by=/usr/share/keyrings/pgdg.asc] http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
+    apt-get update && \
+    apt-get -y install postgresql-client-12 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install SCW CLI
+# Latest release: https://github.com/scaleway/scaleway-cli/releases/latest
+#
+# `-f` makes curl fail on an HTTP error instead of saving the error page as the
+# binary.
+RUN \
+    export VERSION="2.26.0" && \
+    curl -fsSL -o /usr/local/bin/scw "https://github.com/scaleway/scaleway-cli/releases/download/v${VERSION}/scaleway-cli_${VERSION}_linux_amd64" && \
+    chmod +x /usr/local/bin/scw
+
+# Install rclone (its install script relies on curl and unzip)
+RUN curl -fsSL https://rclone.org/install.sh | bash
+
+COPY src /
+
+ENTRYPOINT ["tini", "--"]
+
+CMD [ "/backup.sh" ]
diff --git a/infra/copycat-db/README.md b/infra/copycat-db/README.md
new file mode 100644
index 000000000..95ca44ea4
--- /dev/null
+++ b/infra/copycat-db/README.md
@@ -0,0 +1,144 @@
+## Introduction
+
+Copycat DB is a [service](https://github.com/ente-io/infra) to take a backup of
+our database. It uses the Scaleway CLI to take backups of the database, and
+uploads them to an offsite bucket.
+
+This bucket has an object lock configured, so backups cannot be deleted before
+expiry. Conversely, the service also deletes backups older than some threshold
+when it creates a new one to avoid indefinite retention.
+
+In production the service runs as a cron job, scheduled using a systemd timer.
+
+## Required environment variables
+
+##### SCW_CONFIG_PATH
+
+Path to the `config.yaml` used by Scaleway CLI.
+
+This contains the credentials and the default region to use when trying to
+create and download the database dump.
+ +If needed, this config file can be generated by running the following commands +on a shell prompt in the container (using `./test.sh sh`) + + scw init + scw config dump + +##### SCW_RDB_INSTANCE_ID + +The UUID of the Scaleway RDB instance that we wish to back up. If this is missing, +then the Docker image falls back to using `pg_dump` (as outlined next). + +##### PGUSER, PGPASSWORD, PGHOST + +Not needed in production when taking a backup (since we use the Scaleway CLI to +take backups in production). + +These are used when testing a backup using `pg_dump`, and when restoring backups. + +##### RCLONE_CONFIG + +Location of the config file that contains the destination bucket where you want +to save the backups, and the credentials to access it. + +Specifically, the config file contains two remotes: + +* The bucket itself, where data will be stored. + +* A "crypt" remote that wraps the bucket by applying client side encryption. + +The configuration file will contain (lightly) obfuscated versions of the +password, and as long as we have the configuration file we can continue using +rclone to download and decrypt the plaintext. Still, it is helpful to retain the +original password too separately so that the file can be recreated if needed. + +A config file can be generated using `./test.sh sh` + + rclone config + rclone config show + +When generating the config, we keep file (and directory) name encryption off. + +Note that rclone creates a backup of the config file, so Docker needs to have +write access to the directory where it is mounted. + +##### RCLONE_DESTINATION + +Name of the (crypt) remote to which the dump should be saved. Example: +`db-backup-crypt:`. + +Note that this will not include the bucket - the bucket name will be part of the +remote that the crypt remote wraps. + +##### Logging + +The service logs to its standard out/error. The systemd unit is configured to +route these to `/root/var/logs/copycat-db.log`. 
+ +## Local testing + +The provided `test.sh` script can be used to do a smoke test for building and +running the image. For example, + + ./test.sh bin/bash + +gives us a shell prompt inside the built and running container. + +For more thorough testing, run this service as part of a local test-cluster. + +## Restoring + +The service also knows how to restore the latest backup into a Postgres +instance. This functionality is used to periodically verify that the backups are +restorable. + +To invoke this, use "./restore.sh" as the command when running the container +(e.g. `./test.sh ./restore.sh`). This will restore the latest backup into the +Postgres instance whose credentials are provided via the various `PG*` +environment variables. + +## Preparing the bucket + +The database dumps are stored in a bucket that has object lock enabled +(Compliance mode), and has a default bucket level retention time of 30 days. + +## Deploying + +Ensure that promtail is running, and is configured to scrape +`/root/var/logs/copycat-db.log`. + +Create the config and log destination directories + + sudo mkdir -p /root/var/config/scw + sudo mkdir -p /root/var/config/rclone + sudo mkdir -p /root/var/logs + +Create the env, scw and rclone configuration files + + sudo tee /root/copycat-db.env + sudo tee /root/var/config/scw/copycat-db-config.yaml + sudo tee /root/var/config/rclone/copycat-db-rclone.conf + +Add the service definition, and start the service + + scp copycat-db.{service,timer} instance: + + sudo mv copycat-db.{service,timer} /etc/systemd/system + sudo systemctl daemon-reload + +To enable the cron job + + sudo systemctl enable --now copycat-db.timer + +The timer will trigger the service on the specified schedule. 
In addition, if +you wish to force the job to run immediately + + sudo systemctl start copycat-db.service + +## Updating + +To update, run the [GitHub action](.github/workflows/ci.yaml) to push the latest +image to our Docker Registry, then restart the systemd service on the instance + + sudo systemctl restart copycat-db diff --git a/infra/copycat-db/Runbook.md b/infra/copycat-db/Runbook.md new file mode 100644 index 000000000..fe8160c1d --- /dev/null +++ b/infra/copycat-db/Runbook.md @@ -0,0 +1,13 @@ + +### Service logs + +```bash +tail -f -n 100 /root/var/logs/copycat-db.log +``` + +### Backup timeout +If you are seeing a time-out from scw while waiting for backup, usually just stopping the [service](./copycat-db.service) and letting the [daily timer](./copycat-db.timer) restart it later works + +```bash + sudo systemctl stop copycat-db.service +``` diff --git a/infra/copycat-db/copycat-db.sample.env b/infra/copycat-db/copycat-db.sample.env new file mode 100644 index 000000000..243e8aa5f --- /dev/null +++ b/infra/copycat-db/copycat-db.sample.env @@ -0,0 +1,8 @@ +SCW_CONFIG_PATH=/var/config/scw/copycat-db-config.yaml +SCW_RDB_INSTANCE_ID= +PGUSER= +PGPASSWORD= +PGHOST=host.docker.internal +PGPORT= +RCLONE_CONFIG=/var/config/rclone/copycat-db-rclone.conf +RCLONE_DESTINATION=db-backup-crypt: diff --git a/infra/copycat-db/copycat-db.service b/infra/copycat-db/copycat-db.service new file mode 100644 index 000000000..819baa73c --- /dev/null +++ b/infra/copycat-db/copycat-db.service @@ -0,0 +1,20 @@ +[Unit] +Documentation=https://github.com/ente-io/copycat-db +Requires=docker.service +After=docker.service + +[Service] +Restart=always +RestartSec=3600s +# Don't automatically restart if it fails more than 6 times in 24 hours. 
+StartLimitInterval=86400
+StartLimitBurst=6
+ExecStartPre=docker pull rg.fr-par.scw.cloud/ente/copycat-db
+ExecStartPre=-docker stop copycat-db
+ExecStartPre=-docker rm copycat-db
+ExecStart=docker run --name copycat-db \
+    --env-file /root/copycat-db.env \
+    -v /root/var:/var \
+    rg.fr-par.scw.cloud/ente/copycat-db
+StandardOutput=append:/root/var/logs/copycat-db.log
+StandardError=inherit
diff --git a/infra/copycat-db/copycat-db.timer b/infra/copycat-db/copycat-db.timer
new file mode 100644
index 000000000..c3f6e2e86
--- /dev/null
+++ b/infra/copycat-db/copycat-db.timer
@@ -0,0 +1,8 @@
+[Unit]
+Description=Schedule copycat-db
+
+[Timer]
+OnCalendar=Daily
+
+[Install]
+WantedBy=timers.target
diff --git a/infra/copycat-db/src/backup.sh b/infra/copycat-db/src/backup.sh
new file mode 100755
index 000000000..f197f4b0f
--- /dev/null
+++ b/infra/copycat-db/src/backup.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+set -o errexit
+# Also abort when a command on the left-hand side of a pipe fails (`scw | jq`).
+set -o pipefail
+set -o xtrace
+
+# Refuse to run without a destination, since the rclone commands below would
+# otherwise be invoked with an empty remote.
+: "${RCLONE_DESTINATION:?RCLONE_DESTINATION must be set}"
+
+NOWS="$(date +%s)"
+BACKUP_FILE="db-$NOWS.custom"
+
+# Scaleway backup names cannot contain dots
+BACKUP_NAME="db-$NOWS-custom"
+
+# Calculate an expiry time 1 month from now
+EXPIRYS="$(( 30 * 24 * 60 * 60 + $NOWS ))"
+
+# Convert it to the ISO 8601 format that SCW CLI understands
+# Note that GNU date uses "-d" and an "@" to pass an epoch (macOS uses "-r").
+EXPIRY="$(date -Iseconds --utc --date "@$EXPIRYS")"
+
+if test -z "$SCW_RDB_INSTANCE_ID"
+then
+    # A required SCW related environment variable hasn't been specified. This is
+    # expected when running the script locally for testing. Fallback to using
+    # pg_dump for creating the backup.
+    pg_dump -Fc ente_db > "$BACKUP_FILE"
+else
+    # We need to export a backup first after creating it, before it can be
+    # downloaded.
+    #
+    # Further, our backups currently take longer than the default 20 minute
+    # timeout for the export set by Scaleway, and end up failing:
+    #
+    #     {"error":"scaleway-sdk-go: waiting for database backup failed: timeout after 20m0s"}
+    #
+    # To avoid this we need to add a custom wait here ourselves instead of using
+    # the convenience `--wait` flag for the export command provided by Scaleway.
+    #
+    # `jq -e` exits with a non-zero status when `.id` is missing, so an
+    # unexpected response aborts the script instead of yielding the ID "null".
+    BACKUP_ID="$(scw rdb backup create instance-id="$SCW_RDB_INSTANCE_ID" \
+        name="$BACKUP_NAME" expires-at="$EXPIRY" \
+        database-name=ente_db -o json | jq -er '.id')"
+    scw rdb backup wait "$BACKUP_ID" timeout=5h
+    EXPORT_ID="$(scw rdb backup export "$BACKUP_ID" --wait -o json | jq -er '.id')"
+    scw rdb backup download output="$BACKUP_FILE" "$EXPORT_ID"
+fi
+
+rclone copy --log-level INFO "$BACKUP_FILE" "$RCLONE_DESTINATION"
+
+# Delete older backups
+rclone delete --log-level INFO --min-age 30d "$RCLONE_DESTINATION"
+
+set +o xtrace
+echo "copycat-db: backup complete: $BACKUP_FILE"
diff --git a/infra/copycat-db/src/restore.sh b/infra/copycat-db/src/restore.sh
new file mode 100755
index 000000000..8df19c62b
--- /dev/null
+++ b/infra/copycat-db/src/restore.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+set -o errexit
+# Also abort when `rclone lsf` fails inside the pipeline below.
+set -o pipefail
+set -o xtrace
+
+# Refuse to run without knowing which remote to restore from.
+: "${RCLONE_DESTINATION:?RCLONE_DESTINATION must be set}"
+
+# Find the name of the latest backup
+# The backup file name contains the epoch, so we can just sort.
+BACKUP_FILE="$(rclone lsf --include 'db-*.custom' --files-only "$RCLONE_DESTINATION" | sort | tail -1)"
+
+# Stop if nothing matched: with an empty name the copy below would download
+# the whole remote instead of a single file.
+if test -z "$BACKUP_FILE"
+then
+    echo "copycat-db: no backup found in $RCLONE_DESTINATION" >&2
+    exit 1
+fi
+
+# Download it
+rclone copy --log-level INFO "${RCLONE_DESTINATION}${BACKUP_FILE}" .
+
+# Restore from it
+#
+# This creates a database named rdb on Postgres - this is only used for the
+# initial connection, the actual ente_db database will be created once the
+# restore starts.
+#
+# Flags:
+#
+# * no-owner: recreates the schema using the current user, not the one that was
+#   used for the export.
+#
+# * no-privileges: skip the assignment of roles (this way we do not have to
+#   recreate all the users from the original database before proceeding with the
+#   restore)
+
+createdb rdb || true
+pg_restore -d rdb --create --no-privileges --no-owner --exit-on-error "$BACKUP_FILE"
+
+# Delete any tokens that were in the backup
+psql -d ente_db -c 'delete from tokens'
+
+# Delete any push tokens that were in the backup
+psql -d ente_db -c 'delete from push_tokens'
+
+# Delete some more temporary data that might've come up in the backup
+psql -d ente_db -c 'delete from queue'
+psql -d ente_db -c 'delete from temp_objects'
+
+set +o xtrace
+echo "copycat-db: restore complete: $BACKUP_FILE"
diff --git a/infra/copycat-db/test.sh b/infra/copycat-db/test.sh
new file mode 100755
index 000000000..d4ac1b35f
--- /dev/null
+++ b/infra/copycat-db/test.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+set -o xtrace
+set -o errexit
+
+PROJECT=copycat-db
+
+docker rmi "ente/$PROJECT" || true
+docker build --tag "ente/$PROJECT" .
+
+# Interactively run the container.
+#
+# By passing "$@", we allow any arguments passed to test.sh to be forwarded to
+# the image (useful for testing out things, e.g. `./test.sh sh`).
+docker run \
+    --interactive --tty --rm \
+    --env-file copycat-db.env \
+    --name "$PROJECT" \
+    "ente/$PROJECT" \
+    "$@"