Welcome to the Cloudshalla Engineering Blog! We break down the real, unfiltered truths of DevOps, Cloud, and Platform Engineering, fresh from the production trenches. If you are serious about stepping up your career, you are in exactly the right place.

The 3 Hours I Was Wasting Every Day

Cloud Engineering Architecture

Before I automated: manual log cleanup on 8 servers, manually checking if backup jobs ran, copy-pasting health check results into a Slack channel, running disk usage checks and sending emails. 3 hours. Every. Single. Day. I spent one Saturday and never did those tasks manually again.

Script 1: Automated S3 Backup with Verification

#!/usr/bin/env python3
import boto3
import os
import subprocess
from datetime import datetime
import requests  # For Slack webhook

# Slack incoming-webhook URL; raises KeyError at import time if unset,
# which fails the cron job loudly instead of silently skipping alerts.
SLACK_WEBHOOK = os.environ['SLACK_WEBHOOK_URL']
# Destination bucket for the compressed database dumps.
S3_BUCKET = 'my-company-backups'
# Postgres database name passed to pg_dump below.
DB_NAME = 'production_db'

def backup_database():
    """Dump the Postgres DB, gzip it, upload to S3, verify, and notify Slack.

    Returns:
        bool: True on verified success, False on any failure. Failures are
        also reported to Slack before returning.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'{DB_NAME}_{timestamp}.sql.gz'
    local_path = f'/tmp/{filename}'
    s3_key = f'db-backups/{filename}'

    # 'pipefail' makes the pipeline fail when pg_dump fails; without it the
    # exit code is gzip's, so a broken dump would be uploaded as "success".
    # pipefail is a bash-ism, so run under bash explicitly.
    dump_cmd = f'set -o pipefail; pg_dump {DB_NAME} | gzip > {local_path}'
    result = subprocess.run(dump_cmd, shell=True, capture_output=True,
                            executable='/bin/bash')

    if result.returncode != 0:
        notify_slack(f'❌ Backup FAILED: {result.stderr.decode()}', 'danger')
        return False

    try:
        # Upload to S3
        s3 = boto3.client('s3')
        s3.upload_file(local_path, S3_BUCKET, s3_key)

        # Verify the object actually landed and report its size.
        response = s3.head_object(Bucket=S3_BUCKET, Key=s3_key)
        size_mb = response['ContentLength'] / 1024 / 1024

        notify_slack(f'✅ Backup SUCCESS: {filename} ({size_mb:.1f} MB)', 'good')
        return True
    except Exception as e:
        # Upload or verification failed: report it rather than crashing
        # the cron job with no Slack trace.
        notify_slack(f'❌ Backup upload FAILED: {e}', 'danger')
        return False
    finally:
        # Always remove the local dump, even when the upload failed,
        # so /tmp does not fill up with orphaned archives.
        if os.path.exists(local_path):
            os.remove(local_path)

def notify_slack(message, color='good'):
    """Post a one-line attachment message to the configured Slack webhook.

    Args:
        message: Text to display in the attachment.
        color: Slack attachment color ('good', 'warning', 'danger', or hex).

    Best-effort: a timeout bounds the request so a dead webhook cannot hang
    the cron job, and network errors are printed instead of raised so a
    Slack outage never breaks the job that is trying to report its status.
    """
    payload = {
        'attachments': [{'text': message, 'color': color}]
    }
    try:
        requests.post(SLACK_WEBHOOK, json=payload, timeout=10)
    except requests.RequestException as e:
        print(f'Slack notification failed: {e}')

# Entry point: run one backup cycle when executed directly (e.g. by cron).
if __name__ == '__main__':
    backup_database()

Script 2: Multi-Server Health Check

#!/usr/bin/env python3
import requests
import json
from concurrent.futures import ThreadPoolExecutor

# Health endpoints to probe; each entry must provide 'name' and 'url' keys,
# which check_service() reads below.
SERVICES = [
    {'name': 'API Gateway', 'url': 'https://api.myapp.com/health'},
    {'name': 'Auth Service', 'url': 'https://auth.myapp.com/health'},
    {'name': 'Payment Service', 'url': 'https://payment.myapp.com/health'},
]

def check_service(service):
    """Probe one service's health URL and summarize the outcome.

    Args:
        service: dict with 'name' and 'url' keys.

    Returns:
        On a completed request: {'name', 'status', 'latency', 'code'}.
        On any failure (DNS, timeout, connection): {'name', 'status', 'error'}.
    """
    name = service['name']
    try:
        response = requests.get(service['url'], timeout=5)
    except Exception as exc:
        # Unreachable service: report the error text instead of a status code.
        return {'name': name, 'status': '❌', 'error': str(exc)}

    healthy = response.status_code == 200
    latency_ms = response.elapsed.total_seconds() * 1000
    return {
        'name': name,
        'status': '✅' if healthy else '⚠️',
        'latency': f'{latency_ms:.0f}ms',
        'code': response.status_code,
    }

# Probe all services concurrently (pool capped at 10 workers) so total
# wall time is ~one slowest probe, not the sum of all of them.
with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(check_service, SERVICES))

# Emit the report as pretty-printed JSON; cron redirects stdout to the log.
print(json.dumps(results, indent=2))

Cron Schedule Setup

# crontab -e
# Run backup every day at 2:00 AM
0 2 * * * /usr/bin/python3 /opt/scripts/backup.py >> /var/log/backup.log 2>&1

# Health check every 5 minutes
*/5 * * * * /usr/bin/python3 /opt/scripts/healthcheck.py >> /var/log/health.log 2>&1

# Log cleanup every Sunday at 3 AM
0 3 * * 0 find /var/log/myapp -name "*.log" -mtime +30 -delete

# Disk usage alert if above 80%. '0+$5' coerces "90%" to the number 90;
# a bare '$5 > 80' is a STRING comparison in awk, so "100%" never alerts,
# "9%" falsely alerts, and the "Use%" header line matches every run.
*/10 * * * * df -h | awk '0+$5 > 80 {print}' | python3 /opt/scripts/disk_alert.py
💡 Real talk: The ROI on automation is infinite. 1 weekend of scripting = 3 hours saved daily forever. That's 60+ hours a month you can spend on actual engineering work instead of babysitting infrastructure.

Ready to stop learning theory and start building real projects? Join the Cloudshalla masterclasses to get 1-on-1 mentorship, break into top-tier DevOps roles, and master cloud automation today.