cmd/delete-duplicates/main.go

package main

import (
	"database/sql"
	"fmt"
	"log"
	"os"

	_ "github.com/duckdb/duckdb-go/v2"
)

func main() {
	if len(os.Args) != 2 {
		fmt.Fprintf(os.Stderr, "Usage: %s <database_file>\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "Example: %s honeypot.db\n", os.Args[0])
		os.Exit(1)
	}

	databaseFile := os.Args[1]

	// Open database connection
	db, err := sql.Open("duckdb", databaseFile)
	if err != nil {
		log.Fatalf("failed to open database: %v", err)
	}
	defer db.Close()

	// First, count how many duplicate entries exist (excluding the ones we'll keep)
	var duplicateCount int
	err = db.QueryRow(`
		SELECT COUNT(*) - (
			SELECT COUNT(*)
			FROM (
				SELECT DISTINCT time, remote_addr
				FROM honeypot_events
			)
		)
		FROM honeypot_events
	`).Scan(&duplicateCount)
	if err != nil {
		log.Fatalf("failed to count duplicates: %v", err)
	}

	if duplicateCount == 0 {
		fmt.Println("No duplicate entries found (where timestamp and remote_addr are the same)")
		return
	}

	// Count how many unique duplicate groups exist
	var duplicateGroups int
	err = db.QueryRow(`
		SELECT COUNT(*)
		FROM (
			SELECT time, remote_addr
			FROM honeypot_events
			GROUP BY time, remote_addr
			HAVING COUNT(*) > 1
		)
	`).Scan(&duplicateGroups)
	if err != nil {
		log.Fatalf("failed to count duplicate groups: %v", err)
	}

	// Ask for confirmation
	fmt.Printf("Found %d duplicate entries across %d duplicate groups\n", duplicateCount, duplicateGroups)
	fmt.Print("Are you sure you want to delete these duplicate entries? (yes/no): ")
	var confirmation string
	fmt.Scanln(&confirmation)

	if confirmation != "yes" {
		fmt.Println("Deletion cancelled.")
		return
	}

	// Delete duplicates, keeping the entry with the minimum ID for each (time, remote_addr) combination
	// This uses a subquery to identify which IDs to keep (MIN(id) per group), then deletes all others
	result, err := db.Exec(`
		DELETE FROM honeypot_events
		WHERE (time, remote_addr) IN (
			SELECT time, remote_addr
			FROM honeypot_events
			GROUP BY time, remote_addr
			HAVING COUNT(*) > 1
		)
		AND id NOT IN (
			SELECT MIN(id)
			FROM honeypot_events
			GROUP BY time, remote_addr
		)
	`)
	if err != nil {
		log.Fatalf("failed to delete duplicates: %v", err)
	}

	rowsAffected, err := result.RowsAffected()
	if err != nil {
		log.Fatalf("failed to get rows affected: %v", err)
	}

	fmt.Printf("Successfully deleted %d duplicate entries\n", rowsAffected)
}