Initial commit: Disaster recovery CLI tool

A Go-based CLI tool for recovering servers from backups to new cloud VMs.

Features:
- Multi-cloud support: Exoscale, Cloudscale, Hetzner Cloud
- Backup sources: Local filesystem, Hetzner Storage Box
- 6-stage restore pipeline with /etc whitelist protection
- DNS migration with safety checks and auto-rollback
- Dry-run by default, requires --yes to execute
- Cloud-init for SSH key injection

Packages:
- cmd/recover-server: CLI commands (recover, migrate-dns, list, cleanup)
- internal/providers: Cloud provider implementations
- internal/backup: Backup source implementations
- internal/restore: 6-stage restore pipeline
- internal/dns: Exoscale DNS management
- internal/ui: Prompts, progress, dry-run display
- internal/config: Environment and host configuration

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Olaf Berberich
2025-12-08 00:31:27 +00:00
commit 29a2886402
26 changed files with 3826 additions and 0 deletions
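The restore pipeline in this diff is written against two small abstractions whose definitions are not shown below. As a reading aid, here is a sketch inferred from the call sites in internal/restore; the real declarations live in internal/backup and internal/providers and may differ:

    // Inferred from usage in internal/restore; not the actual source.
    type BackupSource interface {
        // SyncTo copies the named top-level directories (e.g. "root", "opt",
        // "etc") from the host's backup onto the SSH target, authenticating
        // with the private key at keyPath.
        SyncTo(ctx context.Context, host, sshTarget, keyPath string, dirs []string) error
    }

    type VM struct {
        PublicIP string // the pipeline only needs the address to build the SSH target
        // ... provider-specific fields
    }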

internal/restore/docker.go

@@ -0,0 +1,102 @@
package restore
import (
"context"
"fmt"
"path"
"strings"
"time"
)
// startDocker ensures Docker is running and starts compose stacks
func (p *Pipeline) startDocker(ctx context.Context) error {
// Ensure Docker is enabled and running
if err := p.remoteCmd(ctx, "systemctl enable docker"); err != nil {
return fmt.Errorf("failed to enable docker: %w", err)
}
if err := p.remoteCmd(ctx, "systemctl start docker"); err != nil {
return fmt.Errorf("failed to start docker: %w", err)
}
// Wait for Docker to become ready (up to 30s); fail rather than continue silently
ready := false
for i := 0; i < 30; i++ {
if err := p.remoteCmd(ctx, "docker info > /dev/null 2>&1"); err == nil {
ready = true
break
}
time.Sleep(time.Second)
}
if !ready {
return fmt.Errorf("docker did not become ready within 30s")
}
// Find and start docker-compose stacks
findCmd := "find /opt -name 'docker-compose.yml' -o -name 'docker-compose.yaml' -o -name 'compose.yml' -o -name 'compose.yaml' 2>/dev/null | head -20"
output, err := p.remoteCmdOutput(ctx, findCmd)
if err != nil || strings.TrimSpace(output) == "" {
if p.Verbose {
fmt.Println(" No docker-compose files found")
}
return nil
}
composeFiles := strings.Split(strings.TrimSpace(output), "\n")
for _, file := range composeFiles {
if file == "" {
continue
}
// Get directory containing the compose file (path.Dir is safe even
// if the path unexpectedly has no slash)
dir := path.Dir(file)
if p.Verbose {
fmt.Printf(" Starting compose stack in %s\n", dir)
}
// Try docker compose (v2) first, fall back to docker-compose (v1);
// quote the path in case it contains spaces
startCmd := fmt.Sprintf("cd '%s' && (docker compose up -d 2>/dev/null || docker-compose up -d)", dir)
if err := p.remoteCmd(ctx, startCmd); err != nil {
if p.Verbose {
fmt.Printf(" Warning: failed to start stack in %s: %v\n", dir, err)
}
// Don't fail on individual stack failures
}
}
return nil
}
// runHealth performs health verification
func (p *Pipeline) runHealth(ctx context.Context) error {
checks := []struct {
name string
cmd string
require bool
}{
{"SSH accessible", "echo ok", true},
{"Docker running", "docker info > /dev/null 2>&1 && echo ok", true},
{"Network connectivity", "ping -c 1 8.8.8.8 > /dev/null 2>&1 && echo ok", false},
{"DNS resolution", "host google.com > /dev/null 2>&1 && echo ok", false},
}
var failures []string
for _, check := range checks {
output, err := p.remoteCmdOutput(ctx, check.cmd)
success := err == nil && strings.TrimSpace(output) == "ok"
status := "✓"
if !success {
status = "✗"
if check.require {
failures = append(failures, check.name)
}
}
if p.Verbose {
fmt.Printf(" %s %s\n", status, check.name)
}
}
if len(failures) > 0 {
return fmt.Errorf("required health checks failed: %v", failures)
}
return nil
}
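Because runHealth above is table-driven, host-specific probes are a one-line addition. A hypothetical optional probe appended to the checks slice could look like this (not part of the commit):

    // Hypothetical non-required probe: at least one container is running.
    {"Compose stacks up", "docker ps --format '{{.Names}}' | grep -q . && echo ok", false},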


@@ -0,0 +1,130 @@
package restore
import (
"context"
"fmt"
"time"
"recover-server/internal/backup"
"recover-server/internal/providers"
)
// Stage represents a restore stage
type Stage int
const (
StageSync Stage = iota + 1
StageEtc
StageSelectiveEtc
StageSSHKeys
StageServices
StageHealth
)
func (s Stage) String() string {
names := map[Stage]string{
StageSync: "Sync /root and /opt",
StageEtc: "Stage /etc backup",
StageSelectiveEtc: "Selective /etc restore",
StageSSHKeys: "Merge SSH keys",
StageServices: "Start services",
StageHealth: "Health verification",
}
return names[s]
}
// StageResult contains the result of a stage execution
type StageResult struct {
Stage Stage
Success bool
Message string
Duration time.Duration
Error error
}
// Pipeline orchestrates the restore process
type Pipeline struct {
VM *providers.VM
BackupSource backup.BackupSource
HostName string
SSHKeyPath string // Path to ephemeral private key
SSHUser string // Usually "root"
DryRun bool
Verbose bool
results []StageResult
}
// NewPipeline creates a new restore pipeline
func NewPipeline(vm *providers.VM, source backup.BackupSource, host, sshKeyPath string) *Pipeline {
return &Pipeline{
VM: vm,
BackupSource: source,
HostName: host,
SSHKeyPath: sshKeyPath,
SSHUser: "root",
results: make([]StageResult, 0),
}
}
// Run executes all stages
func (p *Pipeline) Run(ctx context.Context) error {
stages := []struct {
stage Stage
fn func(context.Context) error
}{
{StageSync, p.runSync},
{StageEtc, p.runEtcStaging},
{StageSelectiveEtc, p.runSelectiveEtc},
{StageSSHKeys, p.runSSHKeyMerge},
{StageServices, p.runServices},
{StageHealth, p.runHealth},
}
for _, s := range stages {
start := time.Now()
if p.Verbose {
fmt.Printf("\n=== Stage %d: %s ===\n", s.stage, s.stage)
}
if p.DryRun {
p.results = append(p.results, StageResult{
Stage: s.stage,
Success: true,
Message: "[DRY RUN] Would execute: " + s.stage.String(),
Duration: 0,
})
continue
}
err := s.fn(ctx)
result := StageResult{
Stage: s.stage,
Success: err == nil,
Duration: time.Since(start),
Error: err,
}
if err != nil {
result.Message = err.Error()
p.results = append(p.results, result)
return fmt.Errorf("stage %d (%s) failed: %w", s.stage, s.stage, err)
}
result.Message = "Completed successfully"
p.results = append(p.results, result)
}
return nil
}
// Results returns all stage results
func (p *Pipeline) Results() []StageResult {
return p.results
}
// sshTarget returns the SSH target string
func (p *Pipeline) sshTarget() string {
return fmt.Sprintf("%s@%s", p.SSHUser, p.VM.PublicIP)
}
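A minimal sketch of how the recover command might drive this pipeline; the wiring of vm, source, ctx, and the yes flag is hypothetical (the real CLI lives in cmd/recover-server):

    // Hypothetical caller; identifiers vm, source, ctx, yes are assumed.
    p := restore.NewPipeline(vm, source, "web01", keyPair.PrivateKeyPath)
    p.Verbose = true
    p.DryRun = !yes // dry-run by default; --yes enables execution

    if err := p.Run(ctx); err != nil {
        log.Fatalf("restore failed: %v", err)
    }
    for _, r := range p.Results() {
        fmt.Printf("%-22s ok=%t (%s)\n", r.Stage, r.Success, r.Duration)
    }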

internal/restore/ssh.go

@@ -0,0 +1,110 @@
package restore
import (
"context"
"crypto/ed25519"
"crypto/rand"
"encoding/pem"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"golang.org/x/crypto/ssh"
)
// SSHKeyPair holds an ephemeral SSH key pair
type SSHKeyPair struct {
PrivateKeyPath string
PublicKey string
}
// GenerateEphemeralKey creates a temporary ED25519 SSH key pair
func GenerateEphemeralKey() (*SSHKeyPair, error) {
// Generate ED25519 key pair
pubKey, privKey, err := ed25519.GenerateKey(rand.Reader)
if err != nil {
return nil, fmt.Errorf("failed to generate key: %w", err)
}
// Convert to SSH format
sshPubKey, err := ssh.NewPublicKey(pubKey)
if err != nil {
return nil, fmt.Errorf("failed to create SSH public key: %w", err)
}
// Marshal public key
pubKeyStr := strings.TrimSpace(string(ssh.MarshalAuthorizedKey(sshPubKey)))
// Create temp directory for key
tmpDir, err := os.MkdirTemp("", "recover-ssh-")
if err != nil {
return nil, fmt.Errorf("failed to create temp dir: %w", err)
}
// Write private key in OpenSSH format
privKeyPath := filepath.Join(tmpDir, "id_ed25519")
// Marshal private key to OpenSSH format
pemBlock, err := ssh.MarshalPrivateKey(privKey, "")
if err != nil {
return nil, fmt.Errorf("failed to marshal private key: %w", err)
}
privKeyPEM := pem.EncodeToMemory(pemBlock)
if err := os.WriteFile(privKeyPath, privKeyPEM, 0600); err != nil {
return nil, fmt.Errorf("failed to write private key: %w", err)
}
return &SSHKeyPair{
PrivateKeyPath: privKeyPath,
PublicKey: pubKeyStr,
}, nil
}
// Cleanup removes the ephemeral key files
func (k *SSHKeyPair) Cleanup() {
if k.PrivateKeyPath != "" {
os.RemoveAll(filepath.Dir(k.PrivateKeyPath))
}
}
// runSSHKeyMerge merges original authorized_keys with ephemeral key
func (p *Pipeline) runSSHKeyMerge(ctx context.Context) error {
// First, backup current authorized_keys
backupCmd := "cp /root/.ssh/authorized_keys /root/.ssh/authorized_keys.ephemeral 2>/dev/null || true"
p.remoteCmd(ctx, backupCmd)
// Check if we have original keys in the restored /root
checkCmd := "cat /root/.ssh/authorized_keys.original 2>/dev/null || cat /srv/restore/root/.ssh/authorized_keys 2>/dev/null || echo ''"
originalKeys, _ := p.remoteCmdOutput(ctx, checkCmd)
// Get current (ephemeral) keys
currentKeys, _ := p.remoteCmdOutput(ctx, "cat /root/.ssh/authorized_keys 2>/dev/null || echo ''")
// Merge keys (unique)
allKeys := make(map[string]bool)
for _, key := range strings.Split(currentKeys, "\n") {
key = strings.TrimSpace(key)
if key != "" && !strings.HasPrefix(key, "#") {
allKeys[key] = true
}
}
for _, key := range strings.Split(originalKeys, "\n") {
key = strings.TrimSpace(key)
if key != "" && !strings.HasPrefix(key, "#") {
allKeys[key] = true
}
}
// Write merged keys, sorted for deterministic output. Keys are embedded in a
// single-quoted shell string, which assumes they contain no single quotes
// (true for standard authorized_keys entries).
mergedKeys := make([]string, 0, len(allKeys))
for key := range allKeys {
mergedKeys = append(mergedKeys, key)
}
sort.Strings(mergedKeys)
mergeCmd := fmt.Sprintf("mkdir -p /root/.ssh && echo '%s' > /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys",
strings.Join(mergedKeys, "\n"))
return p.remoteCmd(ctx, mergeCmd)
}
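Caller-side sketch for the ephemeral key. The cloud-init injection shown is an assumption based on the "Cloud-init for SSH key injection" feature in the commit message; the actual user-data template is not in this diff:

    // Hypothetical usage; error handling assumes an enclosing function.
    keyPair, err := restore.GenerateEphemeralKey()
    if err != nil {
        return err
    }
    defer keyPair.Cleanup() // removes the temp dir holding the private key

    // The public key is injected into the new VM at boot, e.g. via
    // cloud-init user-data (assumed format):
    userData := fmt.Sprintf("#cloud-config\nssh_authorized_keys:\n  - %s\n", keyPair.PublicKey)

Once the VM is reachable with that key, keyPair.PrivateKeyPath is what gets handed to NewPipeline as sshKeyPath.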

internal/restore/stages.go

@@ -0,0 +1,109 @@
package restore
import (
"context"
"fmt"
"os/exec"
"path"
"strings"
)
// etcWhitelist lists the only /etc paths restored onto the new VM. Restoring
// /etc wholesale would clobber files the fresh cloud image relies on (fstab,
// network config, machine identity), so only these known-safe paths come back.
var etcWhitelist = []string{
"wireguard",
"letsencrypt",
"nginx",
"rsyslog-certs",
"systemd/system",
"docker",
"hostname",
"hosts",
"passwd",
"group",
"shadow",
"gshadow",
}
// runSync syncs /root and /opt from backup
func (p *Pipeline) runSync(ctx context.Context) error {
dirs := []string{"root", "opt"}
return p.BackupSource.SyncTo(ctx, p.HostName, p.sshTarget(), p.SSHKeyPath, dirs)
}
// runEtcStaging stages /etc to /srv/restore/etc
func (p *Pipeline) runEtcStaging(ctx context.Context) error {
// Create staging directory on target
if err := p.remoteCmd(ctx, "mkdir -p /srv/restore"); err != nil {
return fmt.Errorf("failed to create staging dir: %w", err)
}
// Sync /etc to staging
dirs := []string{"etc"}
return p.BackupSource.SyncTo(ctx, p.HostName, p.sshTarget(), p.SSHKeyPath, dirs)
}
// runSelectiveEtc copies only whitelisted items from staged /etc
func (p *Pipeline) runSelectiveEtc(ctx context.Context) error {
for _, item := range etcWhitelist {
src := fmt.Sprintf("/srv/restore/etc/%s", item)
dst := fmt.Sprintf("/etc/%s", item)
// Check if source exists
checkCmd := fmt.Sprintf("test -e %s && echo exists || echo missing", src)
output, err := p.remoteCmdOutput(ctx, checkCmd)
if err != nil || strings.TrimSpace(output) == "missing" {
if p.Verbose {
fmt.Printf(" Skipping %s (not in backup)\n", item)
}
continue
}
// Create the parent directory if needed (e.g. /etc/systemd for systemd/system)
if strings.Contains(item, "/") {
p.remoteCmd(ctx, fmt.Sprintf("mkdir -p %s", path.Dir(dst)))
}
// Copy with rsync into the parent directory: "rsync src dst" would nest an
// already-existing directory (/etc/systemd/system/system), while
// "rsync src parent/" merges into it and preserves ownership and permissions
copyCmd := fmt.Sprintf("rsync -a %s %s/", src, path.Dir(dst))
if err := p.remoteCmd(ctx, copyCmd); err != nil {
return fmt.Errorf("failed to restore %s: %w", item, err)
}
if p.Verbose {
fmt.Printf(" Restored %s\n", item)
}
}
return nil
}
// sshArgs builds the shared argument list for a non-interactive ssh call:
// ssh -i <key> -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
//     -o ConnectTimeout=10 root@<ip> '<cmd>'
func (p *Pipeline) sshArgs(cmd string) []string {
return []string{
"-i", p.SSHKeyPath,
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-o", "ConnectTimeout=10",
p.sshTarget(),
cmd,
}
}
// remoteCmd runs a command on the target VM
func (p *Pipeline) remoteCmd(ctx context.Context, cmd string) error {
return exec.CommandContext(ctx, "ssh", p.sshArgs(cmd)...).Run()
}
// remoteCmdOutput runs a command on the target VM and returns its stdout
func (p *Pipeline) remoteCmdOutput(ctx context.Context, cmd string) (string, error) {
output, err := exec.CommandContext(ctx, "ssh", p.sshArgs(cmd)...).Output()
return string(output), err
}


@@ -0,0 +1,63 @@
package restore
import (
"context"
"fmt"
"strings"
)
// runServices starts restored services
func (p *Pipeline) runServices(ctx context.Context) error {
// Start WireGuard interfaces
if err := p.startWireGuard(ctx); err != nil {
// WireGuard is optional, log but don't fail
if p.Verbose {
fmt.Printf(" WireGuard: %v\n", err)
}
}
// Start Docker
if err := p.startDocker(ctx); err != nil {
return fmt.Errorf("failed to start Docker: %w", err)
}
return nil
}
// startWireGuard enables and starts WireGuard interfaces
func (p *Pipeline) startWireGuard(ctx context.Context) error {
// Check if WireGuard configs exist
checkCmd := "ls /etc/wireguard/*.conf 2>/dev/null | head -5"
output, err := p.remoteCmdOutput(ctx, checkCmd)
if err != nil || strings.TrimSpace(output) == "" {
return fmt.Errorf("no WireGuard configs found")
}
// Get interface names
configs := strings.Split(strings.TrimSpace(output), "\n")
for _, conf := range configs {
if conf == "" {
continue
}
// Extract interface name from path (e.g., /etc/wireguard/wg0.conf -> wg0)
parts := strings.Split(conf, "/")
filename := parts[len(parts)-1]
iface := strings.TrimSuffix(filename, ".conf")
if p.Verbose {
fmt.Printf(" Starting WireGuard interface: %s\n", iface)
}
// Enable and start
enableCmd := fmt.Sprintf("systemctl enable wg-quick@%s", iface)
startCmd := fmt.Sprintf("systemctl start wg-quick@%s", iface)
p.remoteCmd(ctx, enableCmd)
if err := p.remoteCmd(ctx, startCmd); err != nil {
return fmt.Errorf("failed to start %s: %w", iface, err)
}
}
return nil
}