사내에서는 ZFS 파일 시스템을 이용해 수집 서버에 EBS 여러 개를 붙여 데이터를 보관하고 있었습니다.
또한 가용성을 높이기 위해 EBS SNAPSHOT을 매일 찍어 보관하고 있었습니다.
문제는 AWS EBS SNAPSHOT의 가격이 너무 비싸다는 점이었습니다!!!
(전체 AWS 비용의 약 20%이상)
# Conceptual outline of the daily backup ({...} values are placeholders).
# 1) Take today's snapshot of the pool.
zfs snapshot yardbase@2023-10-17
# 2) Write the changes between the previous day's snapshot and today's snapshot to a file.
zfs send -I yardbase@2023-10-16 yardbase@2023-10-17 > {location}
# 3) Copy the resulting backup file to the S3 backup bucket.
aws s3 cp {backupfile} {backup bucket}
yardbase@2023-10-16(전일) 스냅샷을 찍은 순간부터 yardbase@2023-10-17(금일)까지의 변경사항을 지정된 위치에 저장
가격 절감을 위해 AWS S3에 저장
# Remove the previous day's snapshot once today's increment has been stored.
# Snapshots are deleted with `zfs destroy`; `zfs snapshot` only creates them.
zfs destroy yardbase@2023-10-16
#!/bin/bash
# Daily ZFS backup: snapshot the pool, export the increment since yesterday's
# snapshot, upload it to S3, then drop yesterday's snapshot.

# Log file that every step appends to.
LOG_FILE="/var/log/snapshot.log"
# Local directory where the exported stream is staged before upload.
SNAPSHOT_DIR="/data/yardbase"
# Destination S3 bucket.
S3_BUCKET="yard-backup"
# Name of the ZFS pool being snapshotted.
ZFS_POOL="yardbase"
# SECURITY: a webhook URL is a secret and should not live in the script.
# It can now be injected through the environment; the literal below is kept
# only as a backward-compatible default and should be rotated and removed.
SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-https://hooks.slack.com/services/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}"
# Snapshot names are calendar dates (YYYY-MM-DD); `date -d` requires GNU date.
YESTERDAY="$(date -d 'yesterday' +'%Y-%m-%d')"
TODAY="$(date +'%Y-%m-%d')"
# Escape a string so it can be embedded inside a JSON string literal.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
_slack_json_escape() {
  local s="${1}"
  local bs='\'
  local dq='"'
  # Backslashes first, otherwise the escapes added below would be doubled.
  s=${s//"${bs}"/"${bs}${bs}"}
  s=${s//"${dq}"/"${bs}${dq}"}
  s=${s//$'\n'/"${bs}n"}
  s=${s//$'\r'/"${bs}r"}
  s=${s//$'\t'/"${bs}t"}
  printf '%s' "${s}"
}

# Post a message to the Slack incoming webhook in SLACK_WEBHOOK_URL.
# Globals:   SLACK_WEBHOOK_URL (read)
# Arguments: $1 - message text; quotes and backslashes are JSON-escaped
# Returns:   exit status of curl
send_slack_notification() {
  local message="${1}"
  # Declared separately so a failing hostname call is not masked by `local`.
  local host
  host="$(hostname)"
  local payload
  printf -v payload \
    '{\n"text": "%s",\n"username": "Snapshot Bot",\n"icon_emoji": ":exclamation:",\n"attachments": [{\n"text": "Hostname: %s"\n}]\n}' \
    "$(_slack_json_escape "${message}")" \
    "$(_slack_json_escape "${host}")"
  curl -X POST -H "Content-type: application/json" --data "${payload}" "${SLACK_WEBHOOK_URL}"
}
# Create the snapshot "${ZFS_POOL}@<name>".
# Globals:   ZFS_POOL, LOG_FILE (read)
# Arguments: $1 - snapshot name
# Exits the whole script with status 1 (after a Slack alert) if zfs fails.
func_make_snapshot() {
  local snapshot_name="${1}"
  local target="${ZFS_POOL}@${snapshot_name}"

  printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Start making snapshot for ${snapshot_name}" >> "${LOG_FILE}"

  # Guard clause: handle the failure first so the happy path reads straight through.
  if ! zfs snapshot "${target}" >> "${LOG_FILE}" 2>&1; then
    printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to create snapshot for ${snapshot_name}" >> "${LOG_FILE}"
    send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to create snapshot for ${snapshot_name}"
    exit 1
  fi

  printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot for ${snapshot_name} created successfully" >> "${LOG_FILE}"
}
# Export the incremental stream between two snapshots to a local file and
# upload it to s3://${S3_BUCKET}/<current>/<hostname>-snap.
#
# Globals:   SNAPSHOT_DIR, S3_BUCKET, ZFS_POOL, LOG_FILE (read)
# Arguments: $1 - current (newer) snapshot name
#            $2 - previous (older) snapshot name, the incremental base
# Returns:   0 only when the stream was exported AND uploaded.
# Exits the script with status 1 (after a Slack alert) on export or upload
# failure, so the caller never deletes the base snapshot of a broken chain.
func_send_snapshot_s3() {
  local current_snapshot="${1}"
  local previous_snapshot="${2}"
  local my_hostname
  my_hostname="$(hostname)"
  local snapshot_file="${SNAPSHOT_DIR}/${current_snapshot}-snap"
  local s3_uri="s3://${S3_BUCKET}/${current_snapshot}/${my_hostname}-snap"

  echo "[$(date +'%Y-%m-%d %H:%M:%S')] Start sending snapshot to S3" >> "${LOG_FILE}"

  if ionice -c3 zfs send --raw -I "${ZFS_POOL}@${previous_snapshot}" "${ZFS_POOL}@${current_snapshot}" > "${snapshot_file}" 2>> "${LOG_FILE}"; then
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot saved locally" >> "${LOG_FILE}"
  else
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally" >> "${LOG_FILE}"
    # A truncated stream is useless for restore; do not leave it behind.
    rm -f -- "${snapshot_file}"
    send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally"
    exit 1
  fi

  # `aws s3 cp` switches to multipart upload by itself for large files, so the
  # hand-written create/upload-part/complete sequence is not needed. The old
  # loop passed the whole file as every part and only stopped on an error.
  if ionice -c3 aws s3 cp --only-show-errors "${snapshot_file}" "${s3_uri}" >> "${LOG_FILE}" 2>&1; then
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Completed upload to ${s3_uri}" >> "${LOG_FILE}"
  else
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to upload snapshot to S3" >> "${LOG_FILE}"
    # Both snapshots still exist, so the stream can be regenerated on retry.
    rm -f -- "${snapshot_file}"
    send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to upload snapshot to S3"
    exit 1
  fi

  rm -f -- "${snapshot_file}"
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] Removed local snapshot file" >> "${LOG_FILE}"
}
# Destroy the snapshot "${ZFS_POOL}@<name>".
# Globals:   ZFS_POOL, LOG_FILE (read)
# Arguments: $1 - snapshot name
# A failure is logged and reported to Slack but does not abort the script.
func_delete_snapshot() {
  local snapshot_name="${1}"
  local target="${ZFS_POOL}@${snapshot_name}"

  printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Deleting snapshot for ${snapshot_name}" >> "${LOG_FILE}"

  if ! zfs destroy "${target}" >> "${LOG_FILE}" 2>&1; then
    printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to delete snapshot ${snapshot_name}" >> "${LOG_FILE}"
    send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to delete snapshot ${snapshot_name}"
    # Propagate the notifier's status, as the if/else form did.
    return "$?"
  fi

  printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot ${snapshot_name} deleted successfully" >> "${LOG_FILE}"
}
# Pipeline: snapshot -> export/upload -> drop yesterday's snapshot.
# Yesterday's snapshot is only destroyed when both earlier steps succeeded.
if func_make_snapshot "${TODAY}" && func_send_snapshot_s3 "${TODAY}" "${YESTERDAY}"; then
  func_delete_snapshot "${YESTERDAY}"
fi
zfs send를 ionice로 실행해 DISK 작업의 우선순위를 낮춰도 DISK IO를 너무 많이 잡아먹어 다른 서비스에 영향이 갈 수 있음.
CGROUP을 사용하여 특정 disk에 대한 read/write를 제한하자
# Create a dedicated cgroup (v2) used to throttle I/O on one block device.
# sudo: /sys/fs/cgroup is root-owned; -p makes the step safe to re-run.
sudo mkdir -p /sys/fs/cgroup/io_limit_nvme1n1
# io.max takes "MAJ:MIN key=value ..."; rbps/wbps are in bytes per second.
# Read and write limits for the same device can be set in a single write.
# NOTE(review): 259:0 is assumed to be the MAJ:MIN of /dev/nvme1n1 — confirm with `lsblk`.
# NOTE(review): io.max only exists if the io controller is enabled in the parent's
# cgroup.subtree_control — confirm on the target host.
sudo sh -c "echo '259:0 rbps=100000 wbps=100000' > /sys/fs/cgroup/io_limit_nvme1n1/io.max"
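/sys/fs/cgroup/io_limit_nvme1n1/cgroup.procs에 pid를 넣으면 해당 프로세스는 /dev/nvme1n1 디스크에 대한 read/write가 io.max에 설정한 값으로 제한됨 (rbps/wbps의 단위는 bytes/sec이므로 위 예시의 100000은 약 100KB/s이며, 100MB/s로 제한하려면 100000000을 설정해야 함)
systemd-run으로 zfs send 실행 후 pid를 /sys/fs/cgroup/io_limit_nvme1n1/cgroup.procs에 등록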
초기 코드
# First attempt (fragment from inside func_send_snapshot_s3): run `zfs send`
# as a transient systemd unit, passing the whole command line through `bash -c`.
# NOTE(review): systemd-run is called without --wait, so this branch presumably
# reports that the unit was started, not that zfs send has finished — confirm.
if systemd-run --unit="system_send_zfs_${current_snapshot}" --quiet bash -c 'zfs send --raw -I '"${ZFS_POOL}"'@'"${previous_snapshot}"' '"${ZFS_POOL}"'@'"${current_snapshot}"' > '"${snapshot_dir}"' 2>> '"${LOG_FILE}"''; then
echo "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot saved locally" >> "${LOG_FILE}"
else
echo "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally" >> "${LOG_FILE}"
send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally"
exit 1
fi
# Look up the unit's main PID and move it into the I/O-limited cgroup.
pid=$(systemctl show -p MainPID --value system_send_zfs_${current_snapshot})
echo "$pid" > /sys/fs/cgroup/io_limit_nvme1n1/cgroup.procs
# Poll once per second until the unit is no longer active.
while true; do
systemctl is-active --quiet "system_send_zfs_${current_snapshot}"
if [ $? -ne 0 ]; then
break
fi
sleep 1
done
→파일을 떨어트리는 > 부분을 넣을 수 없음
# Second attempt (fragment): the zfs send command and its output redirection
# live in a separate script, which systemd-run starts as a transient unit.
if systemd-run --unit="system_send_zfs_${current_snapshot}" --quiet /bin/bash /home/jenkins/zfssend.sh; then
echo "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot saved locally" >> "${LOG_FILE}"
else
echo "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally" >> "${LOG_FILE}"
send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally"
exit 1
fi
# Look up the unit's main PID, record it in the log, and move it into the
# I/O-limited cgroup.
pid=$(systemctl show -p MainPID --value system_send_zfs_${current_snapshot})
echo "$pid" >> "${LOG_FILE}"
echo "$pid" > /sys/fs/cgroup/io_limit_nvme1n1/cgroup.procs
# Poll once per second until the unit is no longer active.
while true; do
systemctl is-active --quiet "system_send_zfs_${current_snapshot}"
if [ $? -ne 0 ]; then
break
fi
sleep 1
done
디스크 복구 시 증분백업의 기준점이 되는 스냅샷이 있어야 복구 가능
cannot receive incremental stream: most recent snapshot of yardbase does not
match incremental source
매달 1일에 증분백업이 아닌 일반 백업을 실시
초기코드
# Before the change (fragment): start whatever script ${zfssend_script} names
# as a transient systemd unit and log the outcome of starting it.
echo "[$(date +'%Y-%m-%d %H:%M:%S')] Start sending snapshot to S3" >> "${LOG_FILE}"
if systemd-run --unit="system_send_zfs_${current_snapshot}" --quiet /bin/bash "${zfssend_script}"; then
echo "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot saved locally" >> "${LOG_FILE}"
else
echo "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally" >> "${LOG_FILE}"
send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally"
exit 1
fi
수정 코드
# After the change (fragment): on the 1st day of the month use the full-backup
# script, on every other day the incremental one.
if [ "$(date +'%d')" -eq "01" ]; then
zfssend_script="/home/jenkins/zfssendall.sh"
else
zfssend_script="/home/jenkins/zfssend.sh"
fi
echo "[$(date +'%Y-%m-%d %H:%M:%S')] Start sending snapshot to S3" >> "${LOG_FILE}"
# Start the selected script as a transient systemd unit.
if systemd-run --unit="system_send_zfs_${current_snapshot}" --quiet /bin/bash "${zfssend_script}"; then
echo "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot saved locally" >> "${LOG_FILE}"
else
echo "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally" >> "${LOG_FILE}"
send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally"
exit 1
fi
#!/bin/bash
# Daily ZFS backup (final version): snapshot the pool, export the stream via a
# helper script run under systemd with an I/O limit, sync it to S3, then drop
# yesterday's snapshot.

# Log file that every step appends to.
LOG_FILE="/var/log/snapshot.log"
# Local staging directory for exported streams; synced to S3 as a whole.
SNAPSHOT_DIR="/data/yardbase/snapshot"
# Destination S3 bucket.
S3_BUCKET="yard-backup"
# Name of the ZFS pool being snapshotted.
ZFS_POOL="yardbase"
# SECURITY: a webhook URL is a secret and should not live in the script.
# It can now be injected through the environment; the literal below is kept
# only as a backward-compatible default and should be rotated and removed.
SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-https://hooks.slack.com/services/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}"
# Snapshot names are calendar dates (YYYY-MM-DD); `date -d` requires GNU date.
YESTERDAY="$(date -d 'yesterday' +'%Y-%m-%d')"
TODAY="$(date +'%Y-%m-%d')"
# Escape a string so it can be embedded inside a JSON string literal.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
_slack_json_escape() {
  local s="${1}"
  local bs='\'
  local dq='"'
  # Backslashes first, otherwise the escapes added below would be doubled.
  s=${s//"${bs}"/"${bs}${bs}"}
  s=${s//"${dq}"/"${bs}${dq}"}
  s=${s//$'\n'/"${bs}n"}
  s=${s//$'\r'/"${bs}r"}
  s=${s//$'\t'/"${bs}t"}
  printf '%s' "${s}"
}

# Post a message to the Slack incoming webhook in SLACK_WEBHOOK_URL.
# Globals:   SLACK_WEBHOOK_URL (read)
# Arguments: $1 - message text; quotes and backslashes are JSON-escaped
# Returns:   exit status of curl
send_slack_notification() {
  local message="${1}"
  # Declared separately so a failing hostname call is not masked by `local`.
  local host
  host="$(hostname)"
  local payload
  printf -v payload \
    '{\n"text": "%s",\n"username": "Snapshot Bot",\n"icon_emoji": ":exclamation:",\n"attachments": [{\n"text": "Hostname: %s"\n}]\n}' \
    "$(_slack_json_escape "${message}")" \
    "$(_slack_json_escape "${host}")"
  curl -X POST -H "Content-type: application/json" --data "${payload}" "${SLACK_WEBHOOK_URL}"
}
# Take a snapshot of the pool named "${ZFS_POOL}@<name>".
# Globals:   ZFS_POOL, LOG_FILE (read)
# Arguments: $1 - snapshot name
# On zfs failure: logs, alerts Slack, and terminates the script with status 1.
func_make_snapshot() {
  local snapshot_name="${1}"
  local dataset_snapshot="${ZFS_POOL}@${snapshot_name}"

  printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Start making snapshot for ${snapshot_name}" >> "${LOG_FILE}"

  zfs snapshot "${dataset_snapshot}" >> "${LOG_FILE}" 2>&1 || {
    printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to create snapshot for ${snapshot_name}" >> "${LOG_FILE}"
    send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to create snapshot for ${snapshot_name}"
    exit 1
  }

  printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot for ${snapshot_name} created successfully" >> "${LOG_FILE}"
}
# Export today's ZFS stream through a helper script run as a transient systemd
# unit (so it can be placed in an I/O-limited cgroup), then sync the staging
# directory to s3://${S3_BUCKET}/<current>/.
#
# Globals:   SNAPSHOT_DIR, S3_BUCKET, LOG_FILE (read)
# Arguments: $1 - current snapshot name (used for the unit name and S3 prefix)
#            $2 - previous snapshot name; accepted for interface compatibility,
#                 the helper scripts derive their own snapshot names from the date
# Returns:   0 only when the export unit finished cleanly AND the sync succeeded.
# Exits the script with status 1 (after a Slack alert) otherwise, so the
# caller never destroys the base snapshot of an increment that was not stored.
func_send_snapshot_s3() {
  local current_snapshot="${1}"
  local previous_snapshot="${2}"
  local my_hostname
  my_hostname="$(hostname)"
  local unit_name="system_send_zfs_${current_snapshot}"
  local io_cgroup_procs="/sys/fs/cgroup/io_limit_nvme1n1/cgroup.procs"
  local zfssend_script pid
  # zfssend.sh writes <date>-snap, zfssendall.sh writes <hostname>-<date>-snap;
  # both names must be considered when checking and cleaning up.
  local incremental_file="${SNAPSHOT_DIR}/${current_snapshot}-snap"
  local full_file="${SNAPSHOT_DIR}/${my_hostname}-${current_snapshot}-snap"

  # Full backup on the 1st of each month, incremental on every other day.
  if [ "$(date +'%d')" -eq "01" ]; then
    zfssend_script="/home/jenkins/zfssendall.sh"
  else
    zfssend_script="/home/jenkins/zfssend.sh"
  fi

  echo "[$(date +'%Y-%m-%d %H:%M:%S')] Start sending snapshot to S3" >> "${LOG_FILE}"

  # Without --wait, systemd-run returns as soon as the unit is started, so
  # success here means "started", not "stream written".
  if ! systemd-run --unit="${unit_name}" --quiet /bin/bash "${zfssend_script}"; then
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally" >> "${LOG_FILE}"
    send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally"
    exit 1
  fi

  pid="$(systemctl show -p MainPID --value "${unit_name}")"
  echo "$pid" >> "${LOG_FILE}"
  # MainPID is empty or 0 when the unit has already finished; writing 0 to
  # cgroup.procs would move this shell instead, so only attach a real PID.
  if [[ "${pid}" =~ ^[1-9][0-9]*$ ]]; then
    if ! { echo "${pid}" > "${io_cgroup_procs}"; } 2>> "${LOG_FILE}"; then
      echo "[$(date +'%Y-%m-%d %H:%M:%S')] Warning: could not attach pid ${pid} to ${io_cgroup_procs}" >> "${LOG_FILE}"
    fi
  else
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Warning: no main PID for ${unit_name}; I/O limit not applied" >> "${LOG_FILE}"
  fi

  # Wait for the export unit to stop running.
  while systemctl is-active --quiet "${unit_name}"; do
    sleep 1
  done

  # `is-active` is false for both a clean exit and a failure, so distinguish
  # them: a failed unit stays in the "failed" state, and a good run leaves a
  # non-empty stream file.
  if systemctl is-failed --quiet "${unit_name}" \
    || { [ ! -s "${incremental_file}" ] && [ ! -s "${full_file}" ]; }; then
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally" >> "${LOG_FILE}"
    systemctl reset-failed "${unit_name}" >> "${LOG_FILE}" 2>&1
    rm -f -- "${incremental_file}" "${full_file}"
    send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally"
    exit 1
  fi
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot saved locally" >> "${LOG_FILE}"

  if ! ionice -c3 nice -n 19 aws s3 sync "${SNAPSHOT_DIR}" "s3://${S3_BUCKET}/${current_snapshot}/" >> "${LOG_FILE}" 2>&1; then
    # The local stream is kept on purpose: it is the only copy of this increment.
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to upload snapshot to S3" >> "${LOG_FILE}"
    send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to upload snapshot to S3"
    exit 1
  fi

  rm -f -- "${incremental_file}" "${full_file}"
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] Removed local snapshot directory" >> "${LOG_FILE}"
}
# Remove the snapshot "${ZFS_POOL}@<name>" from the pool.
# Globals:   ZFS_POOL, LOG_FILE (read)
# Arguments: $1 - snapshot name
# A failed destroy is logged and sent to Slack; the script keeps running.
func_delete_snapshot() {
  local snapshot_name="${1}"
  local dataset_snapshot="${ZFS_POOL}@${snapshot_name}"

  printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Deleting snapshot for ${snapshot_name}" >> "${LOG_FILE}"

  if zfs destroy "${dataset_snapshot}" >> "${LOG_FILE}" 2>&1; then
    printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot ${snapshot_name} deleted successfully" >> "${LOG_FILE}"
    return
  fi

  printf '%s\n' "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to delete snapshot ${snapshot_name}" >> "${LOG_FILE}"
  send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to delete snapshot ${snapshot_name}"
}
# Pipeline: snapshot -> export/upload -> drop yesterday's snapshot.
# Yesterday's snapshot is only destroyed when both earlier steps succeeded.
if func_make_snapshot "${TODAY}" && func_send_snapshot_s3 "${TODAY}" "${YESTERDAY}"; then
  func_delete_snapshot "${YESTERDAY}"
fi
#!/bin/bash
# zfssend.sh — write the incremental ZFS stream between yesterday's and
# today's snapshots to the local staging directory.
# Exit status: 0 on success, 1 when `zfs send` (or opening the output) fails.

ZFS_POOL="yardbase"
# Snapshot names are calendar dates (YYYY-MM-DD); `date -d` requires GNU date.
previous_snapshot="$(date -d 'yesterday' +'%Y-%m-%d')"
current_snapshot="$(date +'%Y-%m-%d')"
# Despite the name, this is the path of the output FILE, not a directory.
snapshot_dir="/data/yardbase/snapshot/${current_snapshot}-snap"
LOG_FILE="/var/log/snapshot.log"

# Make sure the staging directory exists so the redirection below cannot fail
# for that reason alone.
mkdir -p -- "${snapshot_dir%/*}" 2>> "${LOG_FILE}"

if zfs send --raw -I "${ZFS_POOL}@${previous_snapshot}" "${ZFS_POOL}@${current_snapshot}" > "${snapshot_dir}" 2>> "${LOG_FILE}"; then
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot saved locally" >> "${LOG_FILE}"
else
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally" >> "${LOG_FILE}"
  # A truncated stream must not be picked up and uploaded as if it were valid.
  rm -f -- "${snapshot_dir}"
  # send_slack_notification is not defined in this standalone script; call it
  # only when the environment provides it, otherwise record why no alert went out.
  if declare -F send_slack_notification > /dev/null; then
    send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally"
  else
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Slack notification skipped: send_slack_notification is not defined here" >> "${LOG_FILE}"
  fi
  exit 1
fi
#!/bin/bash
# zfssendall.sh — write a FULL (non-incremental) ZFS stream of today's
# snapshot to the local staging directory.
# Exit status: 0 on success, 1 when `zfs send` (or opening the output) fails.

ZFS_POOL="yardbase"
# Snapshot names are calendar dates (YYYY-MM-DD).
current_snapshot="$(date +'%Y-%m-%d')"
my_hostname="$(hostname)"
# Despite the name, this is the path of the output FILE, not a directory.
snapshot_dir="/data/yardbase/snapshot/${my_hostname}-${current_snapshot}-snap"
LOG_FILE="/var/log/snapshot.log"

# Make sure the staging directory exists so the redirection below cannot fail
# for that reason alone.
mkdir -p -- "${snapshot_dir%/*}" 2>> "${LOG_FILE}"

if zfs send --raw "${ZFS_POOL}@${current_snapshot}" > "${snapshot_dir}" 2>> "${LOG_FILE}"; then
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] Snapshot saved locally" >> "${LOG_FILE}"
else
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally" >> "${LOG_FILE}"
  # A truncated stream must not be picked up and uploaded as if it were valid.
  rm -f -- "${snapshot_dir}"
  # send_slack_notification is not defined in this standalone script; call it
  # only when the environment provides it, otherwise record why no alert went out.
  if declare -F send_slack_notification > /dev/null; then
    send_slack_notification "[$(date +'%Y-%m-%d %H:%M:%S')] Failed to save snapshot locally"
  else
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Slack notification skipped: send_slack_notification is not defined here" >> "${LOG_FILE}"
  fi
  exit 1
fi
zfs 설치
# Install ZFS on an apt-based host.
# Register the PPA the ZFS packages are taken from.
sudo add-apt-repository ppa:jonathonf/zfs -y
# Refresh package lists and upgrade installed packages.
sudo apt-get update -y
sudo apt-get upgrade -y
# Install the DKMS-built ZFS kernel module package.
sudo apt install zfs-dkms -y
# NOTE(review): presumably rebooting so the new kernel module is loaded — confirm.
sudo reboot
zfs storage pool 생성
# Create pool "yardbase" on /dev/nvme1n1, mounted at /data/yardbase, with
# ashift=12. Options are placed before the pool name, as in the documented
# synopsis, instead of relying on getopt reordering trailing options.
zpool create -o ashift=12 -m /data/yardbase yardbase /dev/nvme1n1
presigned URL로 스냅샷 다운로드
# Download the stored stream via the URL held in ${presinedurl} and save it
# under the given file name in the current directory.
wget "${presinedurl}" -O database-2023-10-14-snap
dataset생성
# Create a separate dataset to restore into, and mount it at /data/yardbase2.
zfs create yardbase/recover
zfs set mountpoint=/data/yardbase2 yardbase/recover
스냅샷 복구
# Apply the downloaded stream to the recovery dataset (-F is passed to zfs receive).
# NOTE(review): this reads the file from /data/yardbase/, so the download step
# is presumably run from that directory — confirm.
zfs receive -F yardbase/recover < /data/yardbase/database-2023-10-14-snap
원하는 날까지 3,5 반복
5번 단계를 맨 처음 진행할 때 사용하는 파일은 증분백업 파일이 아닌 전체 백업 파일이어야 함.
# Write the same PID into the cgroup.procs file of three per-disk cgroups.
# NOTE(review): io.max is a cgroup v2 interface, and in cgroup v2 a process
# belongs to exactly one cgroup, so each write presumably MOVES the process and
# only the last cgroup's limits would apply. Limiting several disks likely needs
# one cgroup whose io.max has a line per device — confirm on the target host.
echo "$pid" > /sys/fs/cgroup/io_limit_nvme1n1/cgroup.procs
echo "$pid" > /sys/fs/cgroup/io_limit_nvme2n1/cgroup.procs
echo "$pid" > /sys/fs/cgroup/io_limit_nvme3n1/cgroup.procs