From 2388610574e08a1587e62e961d00645220588305 Mon Sep 17 00:00:00 2001 From: Marc LeBlanc Date: Sat, 14 Dec 2024 18:08:06 -0700 Subject: [PATCH] Adding notes on git gc commands --- repo-converter/build/sg_maintenance.sh | 226 ++++++++++++++++++++++--- 1 file changed, 202 insertions(+), 24 deletions(-) diff --git a/repo-converter/build/sg_maintenance.sh b/repo-converter/build/sg_maintenance.sh index 2817129..27cecb1 100644 --- a/repo-converter/build/sg_maintenance.sh +++ b/repo-converter/build/sg_maintenance.sh @@ -1,10 +1,28 @@ #!/usr/bin/env bash +############################################################################### +# WARNING +# This script was created for Sourcegraph Implementation Engineering deployments +# and is not intended, designed, built, or supported for use in any other scenario. +# Feel free to open issues or PRs, but responses are best effort. +############################################################################### + +# Description: # Custom implementation of sg_maintenance.sh -# Original: https://github.com/sourcegraph/sourcegraph/blob/3.40/cmd/gitserver/server/sg_maintenance.sh +# For a customer running p4-fusion to convert massive repos from Perforce to Git +# p4-fusion does not duplicate Git CLI functionality, including garbage collection +# Originally written on 2022-08-04, using v3.40 version of sg_maintenance.sh as a base +# https://github.com/sourcegraph/sourcegraph/blob/3.40/cmd/gitserver/server/sg_maintenance.sh # Retrieved from customer Slack channel on 2024-12-13 -# -# The differences are: +# The base sg_maintenance.sh script hasn't changed up to v5.10.2832 +# https://github.com/sourcegraph/sourcegraph/blob/v5.10.2832/cmd/gitserver/internal/sg_maintenance.sh +# However, since migrating to Bazel, the script has been built into the gitserver Go binary, +# and is no longer on gitserver's volume to find and execute manually, +# so, customers need to copy this script from here, and paste it into their +# gitserver's 
sourcegraph user's home directory, at /home/sourcegraph/sg_maintenance.sh + +# The customizations are to make the script more ergonomic to execute manually +# for customers who have disabled Git's garbage collection # 1. This script can be stored in /home/sourcegraph on gitserver's volume, # and executed from there # 2. The path to the needed git repo can be passed in as a script parameter, @@ -13,71 +31,231 @@ # which should prevent Sourcegraph's background processes from running on the repo while this script is running, # and git gc +############################################################################### +# sg_maintenance.sh additions start here +############################################################################### + +# Configure Bash options +# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html +# -e Exit if there are any errors +# -u Exit if a variable is referenced before assigned +# -x Print out commands before they are executed +# -o pipefail Exit if a command piped into another command fails set -euxo pipefail +# Grab the repo's root folder as the first parameter +# Bash parameter expansion sets the variable to an empty string if no parameters are provided REPOSITORY_FOLDER="${1:-""}" + +# If no repo folder was provided if [ -z "${REPOSITORY_FOLDER}" ]; then + + # Print usage instructions and exit echo "USAGE: $(basename "${BASH_SOURCE[0]}") [REPOSITORY_ROOT_FOLDER]" exit 1 + fi + +# cd to the provided repo cd "$REPOSITORY_FOLDER" +# Track files to be cleaned up on exit declare -a files_to_cleanup +# Cleanup function to be run on exit +# Each rm is followed by `|| true`, so a failed removal does not prevent the remaining files from being removed
function cleanup() { for file in "${files_to_cleanup[@]}"; do rm "$file" || true done } + +# Configure the exit trap to run the cleanup function trap cleanup EXIT +# Enable noclobber, so output redirection will not overwrite existing files set -o noclobber -# pause all cleanup jobs, including garbage collection +# Create the SG_PAUSE lock file in the repo directory +# to prevent concurrent Sourcegraph cleanup jobs from starting +# With noclobber enabled, this should fail and exit the script if the file already exists, +# i.e. if a Sourcegraph cleanup job is already running echo "running sg maintenance manually" >SG_PAUSE -# cleanup the pause file once the script is done +# Cleanup the SG_PAUSE file once the script is done files_to_cleanup+=("SG_PAUSE") -# set the 'git gc' pause file to prevent concurrent gc jobs +# Create the gc.pid lock file in the repo directory +# to prevent concurrent git garbage collection jobs from starting +# With noclobber enabled, this should fail and exit the script if the file already exists, +# i.e. if a git garbage collection job is already running echo "1 $(hostname)" >.git/gc.pid +# Disable noclobber, so output redirection will overwrite existing files set +o noclobber -# try running 'git gc' (expecting to it fail) to confirm that our lock file works as expected +# Test running a concurrent 'git gc', expecting it to fail, to validate that our lock file works as expected +# Exit the script if git gc doesn't fail if git gc &>/dev/null; then echo "expected 'git gc' to fail, but it didn't. Please inspect the .git/gc.pid lockfile to confirm that it contains the correct contents."
exit 1 fi -# cleanup the 'git gc' lock file once the script is done +# Cleanup the 'git gc' lock file once the script is done files_to_cleanup+=(".git/gc.pid") -# Run sg_maintenance.sh steps from https://github.com/sourcegraph/sourcegraph/blob/3.40/cmd/gitserver/server/sg_maintenance.sh +############################################################################### +# sg_maintenance.sh additions mostly end here +# Run sg_maintenance.sh steps from +# https://github.com/sourcegraph/sourcegraph/blob/v5.10.2832/cmd/gitserver/internal/sg_maintenance.sh +# with minor changes, as noted +############################################################################### + +#!/usr/bin/env sh +# This script runs several git commands with the goal of optimizing the +# performance of git for large repositories. +# +# Relation to git gc and git maintenance: +# +# git-gc +# ------ +# The order of commands in this script is based on the order in which git gc +# calls the same commands. The following is a list of commands based on running +# "GIT_TRACE2=1 git gc". +# +# git pack-refs --all --prune +# git reflog expire --all +# git repack -d -l --cruft --cruft-expiration=2.weeks.ago +# -> git pack-objects --local --delta-base-offset .git/objects/pack/.tmp-73874-pack --keep-true-parents --honor-pack-keep --non-empty --all --reflog --indexed-objects +# -> git pack-objects --local --delta-base-offset .git/objects/pack/.tmp-73874-pack --cruft --cruft-expiration=2.weeks.ago --honor-pack-keep --non-empty --max-pack-size=0 +# git prune --expire 2.weeks.ago +# git worktree prune --expire 3.months.ago +# git rerere gc +# commit-graph (not traced) +# +# We deviate from git gc as follows: +# - For "git repack" and "git commit-graph write" we choose a different set of +# flags. +# - We omit the commands "git rerere" and "git worktree prune" because they +# don't apply to our use-case.
+# +# git-maintenance +# --------------- +# As of git 2.34.1, it is not possible to sufficiently fine-tune the tasks git +# maintenance runs. The tasks are configurable with git config, but not all +# flags are exposed as config parameters. For example, the task +# "incremental-repack" does not allow setting --geometric=2. If future releases +# of git allow us to set more parameters for "git maintenance", we should +# consider switching from this script to "git maintenance". + +############################################################################### +# sg_maintenance.sh customization +# set -xe +# Commented out because Bash options were set in earlier additions +############################################################################### # Usually run by git gc. Pack heads and tags for efficient repository access. # --all Pack branch tips as well. Useful for a repository with many branches of # historical interest. +############################################################################### +# Marc's notes from https://git-scm.com/docs/git-pack-refs +# --all to pack all refs (branches, tags, and HEAD) +# There's no arg for --prune, this seems to be the default behaviour +# There is an arg for --no-prune, but that's not what we need +# Marc to test if the --prune arg causes any issues +############################################################################### git pack-refs --all --prune # Usually run by git gc. The "expire" subcommand prunes older reflog entries. # Entries older than expire time, or entries older than expire-unreachable time # and not reachable from the current tip, are removed from the reflog. 
# --all Process the reflogs of all references +############################################################################### +# Comment from Eng: +# We may want to revisit expiring objects immediately +# +# Marc's notes from https://git-scm.com/docs/git-reflog +# Reference logs ("reflogs"), record when the tips of branches and other references +# were updated in the local repository. +# Reflogs are useful in various Git commands, to specify the old value of a reference. +# For example, +# HEAD@{2} means "where HEAD used to be, two moves ago", +# master@{one.week.ago} means "where master used to point to, one week ago" +# +# The "expire" subcommand prunes older reflog entries. +# Entries older than expire time, or +# entries older than expire-unreachable time and not reachable from the current tip, +# are removed from the reflog. +# +# --expire=