Skip to content

System Automation with Python

Version: 0.2 Year: 2026


Copyright (c) 2025-2026 Ryan Thomas Robson / Robworks Software LLC. Licensed under CC BY-NC-ND 4.0. You may share this material for non-commercial purposes with attribution, but you may not distribute modified versions.


Python's real power for sysadmins lies in its ability to interact with the operating system. Whether you're managing files, running external commands, parsing arguments, or building monitoring checks, Python provides robust modules that are more reliable, testable, and readable than complex Bash scripts.


File and Directory Operations

os Module Basics

The os module provides low-level operating system interfaces.

import os

# Current working directory
cwd = os.getcwd()

# Environment variables (the second argument is the fallback value)
home = os.getenv("HOME")
debug = os.getenv("DEBUG", "false")

# Report the size of every regular file in /var/log
log_dir = "/var/log"
for entry in os.listdir(log_dir):
    entry_path = os.path.join(log_dir, entry)
    if not os.path.isfile(entry_path):
        continue
    print(f"{entry}: {os.path.getsize(entry_path):,} bytes")

# Create the directory and any missing parents; no error if it already exists
os.makedirs("/tmp/backup/2026/03", exist_ok=True)

pathlib for Modern File Operations

pathlib provides a cleaner interface for everything os.path does.

from pathlib import Path

# Compose paths with the / operator
log_dir = Path("/var/log")
backup_dir = Path("/tmp/backup") / "2026" / "03"
backup_dir.mkdir(parents=True, exist_ok=True)

# Non-recursive: *.log files directly inside /var/log
for log_file in log_dir.glob("*.log"):
    n_bytes = log_file.stat().st_size
    print(f"{log_file.name}: {n_bytes:,} bytes")

# Recursive: every *.conf anywhere under /etc
for conf in Path("/etc").rglob("*.conf"):
    print(conf)

shutil for High-Level Operations

shutil handles operations that work on entire files and directory trees.

import shutil

# Copy a file (copy2 also preserves metadata: timestamps and permission bits)
shutil.copy2("config.yaml", "config.yaml.bak")

# Move a file into a directory (works across filesystems, unlike os.rename)
shutil.move("temp_data.csv", "/data/archives/")

# Copy an entire directory tree
# NOTE: fails if the destination exists (pass dirs_exist_ok=True on 3.8+ to allow it)
shutil.copytree("/etc/nginx", "/tmp/nginx-backup")

# Delete a directory tree (equivalent to rm -rf -- irreversible, no confirmation)
shutil.rmtree("/tmp/nginx-backup")

# Create a compressed archive; arguments are (base_name, format, root_dir)
shutil.make_archive("/tmp/logs-backup", "gztar", "/var/log")
# Creates /tmp/logs-backup.tar.gz

tempfile for Safe Temporary Files

The tempfile module creates temporary files and directories that are cleaned up automatically.

import tempfile

# Named temporary file; delete=False means it is NOT removed on close --
# the file survives the with block and the caller must delete it (os.unlink(tmp.name))
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
    tmp.write('{"status": "processing"}')
    print(f"Temp file: {tmp.name}")

# Temporary directory (auto-deleted, with its contents, when the context exits)
with tempfile.TemporaryDirectory() as tmpdir:
    work_file = Path(tmpdir) / "output.txt"  # Path comes from pathlib (imported earlier)
    work_file.write_text("intermediate results")
    # Process files in tmpdir...
# Directory and contents are gone here

Running External Commands

The subprocess module is the standard way to run shell commands from Python.

subprocess.run() - The Default Choice

subprocess.run() executes a command, waits for it to finish, and returns the result.

import subprocess

# Run a command, waiting for it to finish, and capture both streams as text
cmd = ["df", "-h", "/"]
result = subprocess.run(
    cmd,
    capture_output=True,
    text=True,                       # decode bytes to str for us
)

if result.returncode != 0:
    print(f"Error: {result.stderr}")
else:
    print(result.stdout)

Capturing stdout and stderr Separately

# Capture both streams independently
result = subprocess.run(
    ["systemctl", "status", "nginx"],
    capture_output=True,             # shorthand for stdout=PIPE, stderr=PIPE
    text=True
)

# returncode, stdout and stderr are all attributes of the CompletedProcess
print(f"Exit code: {result.returncode}")
print(f"stdout:\n{result.stdout}")
print(f"stderr:\n{result.stderr}")

Checking Return Codes

# Raise an exception if the command fails
try:
    result = subprocess.run(
        ["nginx", "-t"],
        capture_output=True,
        text=True,
        check=True                   # Raises CalledProcessError on non-zero exit
    )
    print("Nginx config is valid")
except subprocess.CalledProcessError as e:
    print(f"Nginx config error:\n{e.stderr}")

Passing Input to Commands

# Pipe a string as stdin
log_sample = (
    "INFO: All good\n"
    "ERROR: Disk full\n"
    "INFO: Syncing...\n"
    "ERROR: Timeout"
)
result = subprocess.run(
    ["grep", "ERROR"],
    input=log_sample,                # sent to the child's stdin
    capture_output=True,
    text=True
)
print(result.stdout)
# ERROR: Disk full
# ERROR: Timeout

shell=True is a security risk

Never pass user input to subprocess.run() with shell=True. It enables shell injection attacks where a malicious input like ; rm -rf / gets executed. Always pass commands as a list of arguments (which bypasses the shell entirely), or use shlex.quote() if you absolutely must use shell=True.

# DANGEROUS - user_input could contain shell metacharacters
# (e.g. "x; rm -rf /" would make the shell run a second command)
subprocess.run(f"grep {user_input} /var/log/syslog", shell=True)

# SAFE - each argument is passed directly to the program; no shell is involved
subprocess.run(["grep", user_input, "/var/log/syslog"])

subprocess.run() vs subprocess.Popen()

| Feature              | run()                                | Popen()                                                |
| -------------------- | ------------------------------------ | ------------------------------------------------------ |
| Waits for completion | Yes (blocking)                       | No (non-blocking)                                      |
| Returns              | CompletedProcess                     | Popen object                                           |
| Use when             | You need the result before proceeding | You need to interact with the process while it runs   |

Use run() for 95% of cases. Use Popen() when you need to stream output line by line, send input interactively, or run commands concurrently.

# Popen for streaming output: tail -f never exits, so a blocking run() would hang
process = subprocess.Popen(
    ["tail", "-f", "/var/log/syslog"],
    stdout=subprocess.PIPE,
    text=True
)

# Iterating the pipe yields lines as the child writes them
for line in process.stdout:
    if "ERROR" in line:
        print(f"ALERT: {line.strip()}")
        process.terminate()          # stop the child before leaving the loop
        break

Logging

The logging module is Python's built-in logging framework. It replaces print() for anything beyond quick debugging.

import logging

# Basic configuration: applies to the root logger, so it affects every logger
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

logger = logging.getLogger(__name__)

logger.info("Service check started")
# No %-escaping needed: logging only applies %-formatting when args are passed,
# so "85%%" would have been printed literally as "85%%"
logger.warning("Disk usage at 85%")
logger.error("Connection to db01 failed")
logger.critical("All backend servers unreachable")

Logging to a File

# Route every record to both a file and the console via two handlers
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[
        logging.FileHandler("/var/log/my-tool.log"),  # needs write access to /var/log
        logging.StreamHandler()      # Also print to console (stderr by default)
    ]
)

Use logging instead of print() for operational scripts

print() goes to stdout and has no concept of severity levels, timestamps, or output routing. The logging module gives you all three, plus the ability to change verbosity without modifying code (--verbose sets level to DEBUG, default is INFO, --quiet sets WARNING). For one-off scripts during development, print() is fine. For anything that runs in production, use logging.


Command-Line Arguments

sys.argv for Simple Scripts

import sys

argv = sys.argv
# Bail out with a usage message unless a hostname was supplied
if len(argv) < 2:
    print(f"Usage: {argv[0]} <hostname>")
    sys.exit(1)

hostname = argv[1]
print(f"Checking {hostname}...")

argparse for Production Tools

The argparse module builds CLI interfaces with help text, type validation, default values, and subcommands.

import argparse

# Build the CLI: one positional argument plus four options
cli = argparse.ArgumentParser(
    description="Archive and compress old log files."
)
cli.add_argument(
    "directory",
    help="Directory containing logs to archive"
)
cli.add_argument(
    "--days",
    type=int,
    default=7,
    help="Archive logs older than this many days (default: 7)"
)
cli.add_argument(
    "--compress",
    choices=["gzip", "bzip2", "none"],
    default="gzip",
    help="Compression method (default: gzip)"
)
cli.add_argument(
    "-v", "--verbose",
    action="store_true",
    help="Enable verbose output"
)
cli.add_argument(
    "-n", "--dry-run",
    action="store_true",
    help="Show what would be done without making changes"
)

opts = cli.parse_args()

print(f"Archiving logs in {opts.directory} older than {opts.days} days")
if opts.verbose:
    print(f"Compression: {opts.compress}")
if opts.dry_run:
    print("(dry run - no changes will be made)")
$ python3 archive_logs.py --help
usage: archive_logs.py [-h] [--days DAYS] [--compress {gzip,bzip2,none}]
                       [-v] [-n] directory

Archive and compress old log files.

positional arguments:
  directory             Directory containing logs to archive

options:
  -h, --help            show this help message and exit
  --days DAYS           Archive logs older than this many days (default: 7)
  --compress {gzip,bzip2,none}
                        Compression method (default: gzip)
  -v, --verbose         Enable verbose output
  -n, --dry-run         Show what would be done without making changes

Environment Variables

Environment variables pass configuration to scripts without hardcoding values or using config files.

import os
import sys  # fix: sys.exit() below was a NameError without this import

# Read with defaults
db_host = os.getenv("DB_HOST", "localhost")
db_port = int(os.getenv("DB_PORT", "5432"))
debug = os.getenv("DEBUG", "false").lower() == "true"

# Require a variable (fail fast if missing)
api_key = os.environ["API_KEY"]    # Raises KeyError if not set

# Safer pattern: detect the missing (or empty) variable and exit with a message
api_key = os.getenv("API_KEY")
if not api_key:
    print("Error: API_KEY environment variable is required")
    sys.exit(1)

Real-World Pattern: Service Health Checker

#!/usr/bin/env python3
"""Check the health of services and report status."""

import argparse
import logging
import subprocess
import sys

# Configure the root logger once at startup: INFO level, timestamped lines
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
# Named logger for this tool; main() lowers its level to DEBUG under --verbose
logger = logging.getLogger("healthcheck")

def check_service(name):
    """Return True when the systemd unit *name* reports the 'active' state."""
    proc = subprocess.run(
        ["systemctl", "is-active", name],
        capture_output=True,
        text=True,
    )
    # `systemctl is-active` prints the unit state (e.g. "active") on stdout
    state = proc.stdout.strip()
    return state == "active"

def check_port(host, port):
    """Check if a TCP port is reachable.

    Returns True when a connection to (host, port) succeeds within 3 seconds.

    Fix: the previous version interpolated host/port into a bash -c string
    (`echo > /dev/tcp/...`) -- the exact shell-injection pattern this guide
    warns against -- and depended on the external `bash` and `timeout`
    binaries. A plain stdlib socket connection is safe and portable.
    """
    import socket  # stdlib; local import keeps the snippet self-contained

    try:
        with socket.create_connection((host, port), timeout=3):
            return True
    except OSError:
        # covers refused connections, timeouts, and DNS resolution failures
        return False

def main():
    """Run all configured checks and exit 0 (all passed) or 2 (failures)."""
    parser = argparse.ArgumentParser(description="Service health checker")
    parser.add_argument("--services", nargs="+", default=["nginx", "postgresql"])
    parser.add_argument("--check-ports", nargs="+", help="host:port pairs to check")
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    failures = []

    # systemd service checks
    for service in args.services:
        if check_service(service):
            logger.info(f"{service}: OK")
        else:
            logger.error(f"{service}: DOWN")
            failures.append(service)

    # Optional TCP reachability checks
    if args.check_ports:
        for pair in args.check_ports:
            # Fix: pair.split(":") raised an unhandled ValueError on malformed
            # input and broke on hosts containing ':'. rpartition splits on the
            # LAST colon, and bad pairs are reported instead of crashing.
            host, _, port = pair.rpartition(":")
            if not host or not port.isdigit():
                logger.error(f"{pair}: invalid host:port pair")
                failures.append(pair)
                continue
            if check_port(host, int(port)):
                logger.info(f"{host}:{port}: REACHABLE")
            else:
                logger.error(f"{host}:{port}: UNREACHABLE")
                failures.append(pair)

    if failures:
        logger.critical(f"Failed checks: {', '.join(failures)}")
        sys.exit(2)
    else:
        logger.info("All checks passed")
        sys.exit(0)

if __name__ == "__main__":
    main()


Interactive Quizzes




Further Reading


Previous: Working with Files and APIs | Next: Testing and Tooling | Back to Index

Comments