#!/usr/bin/env python3
# /// script
# dependencies = [
#   "pandas>=2.0.0",
#   "matplotlib>=3.7.0",
#   "numpy>=1.24.0",
# ]
# ///
"""
Rolling Sum Analysis Tool
Analyzes CSV output from the GStreamer rollingsum plugin

Usage: uv run analyze_sma.py [csv_file]
"""

import shutil
import sys
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Every column read by the text report or the plots. Checked up front so a
# malformed CSV fails with a clear message instead of a KeyError half-way
# through the report.
REQUIRED_COLUMNS = (
    "frame",
    "column_mean",
    "rolling_mean",
    "deviation",
    "normalized_deviation",
    "dropped",
)

# Percentiles used both for the threshold table and the CDF markers; kept in
# one place so the report and the plot cannot drift apart.
PERCENTILES = (50, 75, 90, 95, 99)

# Multiples of the standard deviation used for "mean + n*sigma" thresholds.
SIGMA_MULTIPLES = (1, 2, 3)

_RULE = "=" * 80


def _print_threshold(label: str, threshold: float, norm_dev: pd.Series) -> None:
    """Print one candidate threshold and how many frames exceed it."""
    frames_dropped = int((norm_dev > threshold).sum())
    drop_rate = frames_dropped / len(norm_dev) * 100
    print(f" {label}: threshold={threshold:.8f}")
    print(f" → Would drop {frames_dropped} frames ({drop_rate:.1f}%)")


def analyze_csv(csv_file: str = "output.csv"):
    """Analyze the rolling sum CSV data and generate insights.

    Reads ``csv_file``, prints summary statistics and candidate drop
    thresholds to stdout, archives a timestamped copy of the CSV under
    ``results/debug`` and writes a four-panel analysis plot next to it.

    Args:
        csv_file: Path to the CSV file. It must contain the columns listed
            in ``REQUIRED_COLUMNS`` and at least one data row.

    Exits:
        Calls ``sys.exit(1)`` after printing an error message when the file
        is missing, is completely empty, lacks a required column, or has no
        data rows.
    """
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: CSV file '{csv_file}' not found.")
        sys.exit(1)
    except pd.errors.EmptyDataError:
        # Raised by pandas for a zero-byte / header-less file.
        print(f"Error: CSV file '{csv_file}' is empty.")
        sys.exit(1)

    missing = [name for name in REQUIRED_COLUMNS if name not in df.columns]
    if missing:
        print(
            f"Error: CSV file '{csv_file}' is missing required column(s): "
            f"{', '.join(missing)}"
        )
        sys.exit(1)

    if df.empty:
        # Without rows every statistic is NaN and the drop rates divide by 0.
        print(f"Error: CSV file '{csv_file}' contains no data rows.")
        sys.exit(1)

    # Only touch the filesystem once the input is known to be usable.
    output_dir = Path("results/debug")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Copy input CSV to results directory with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_name = Path(csv_file).stem
    archived_csv = output_dir / f"{csv_name}_{timestamp}.csv"
    shutil.copy(csv_file, archived_csv)

    dropped = df["dropped"]
    column_mean = df["column_mean"]
    deviation = df["deviation"]
    norm_dev = df["normalized_deviation"]

    print(_RULE)
    print(f"ROLLING SUM ANALYSIS - {csv_file}")
    print(_RULE)
    print()

    # Basic statistics
    print("DATASET OVERVIEW:")
    print(f" Total frames: {len(df)}")
    print(f" Frames dropped: {dropped.sum()}")
    print(f" Frames kept: {(dropped == 0).sum()}")
    print(f" Drop rate: {dropped.mean() * 100:.2f}%")
    print()

    # Column mean statistics
    print("COLUMN MEAN STATISTICS:")
    print(f" Min: {column_mean.min():.6f}")
    print(f" Max: {column_mean.max():.6f}")
    print(f" Range: {column_mean.max() - column_mean.min():.6f}")
    print(f" Mean: {column_mean.mean():.6f}")
    print(f" Std Dev: {column_mean.std():.6f}")
    print()

    # Deviation statistics
    print("DEVIATION STATISTICS:")
    print(f" Min deviation: {deviation.min():.6f}")
    print(f" Max deviation: {deviation.max():.6f}")
    print(f" Mean deviation: {deviation.mean():.6f}")
    print(f" Std dev of deviations: {deviation.std():.6f}")
    print()

    # Normalized deviation statistics
    print("NORMALIZED DEVIATION STATISTICS:")
    print(f" Min: {norm_dev.min():.8f}")
    print(f" Max: {norm_dev.max():.8f}")
    print(f" Mean: {norm_dev.mean():.8f}")
    print(f" Median: {norm_dev.median():.8f}")
    print(f" 95th percentile: {norm_dev.quantile(0.95):.8f}")
    print(f" 99th percentile: {norm_dev.quantile(0.99):.8f}")
    print()

    # Threshold recommendations
    print("THRESHOLD RECOMMENDATIONS:")
    print(" (Based on normalized deviation percentiles)")
    print()
    for p in PERCENTILES:
        _print_threshold(f"{p}th percentile", norm_dev.quantile(p / 100), norm_dev)
    print()

    # Suggest optimal thresholds based on standard deviations
    mean_norm_dev = norm_dev.mean()
    std_norm_dev = norm_dev.std()
    print("STANDARD DEVIATION-BASED THRESHOLDS:")
    for n in SIGMA_MULTIPLES:
        _print_threshold(f"Mean + {n}σ", mean_norm_dev + n * std_norm_dev, norm_dev)
    print()

    # Create visualizations
    plot_file = create_plots(df, csv_file, output_dir, timestamp)

    print(_RULE)
    print("OUTPUT FILES:")
    print(f" CSV Archive: {archived_csv}")
    print(f" Analysis Plot: {plot_file}")
    print(_RULE)


def create_plots(df: pd.DataFrame, csv_file: str, output_dir: Path, timestamp: str) -> Path:
    """Create analysis plots and return the output file path.

    Draws a 2x2 figure: column mean vs rolling mean, deviation over frames,
    a histogram of the normalized deviation, and its cumulative distribution
    with the ``PERCENTILES`` marked.

    Args:
        df: Data with the ``frame``, ``column_mean``, ``rolling_mean``,
            ``deviation`` and ``normalized_deviation`` columns.
        csv_file: Source CSV path; used for the figure title and to derive
            the output file name.
        output_dir: Existing directory the PNG is written into.
        timestamp: Suffix appended to the output file name.

    Returns:
        Path of the saved PNG, ``<csv stem>_analysis_<timestamp>.png`` inside
        ``output_dir``.
    """
    frame = df["frame"]
    deviation = df["deviation"]
    norm_dev = df["normalized_deviation"]

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f'Rolling Sum Analysis - {csv_file}', fontsize=16, fontweight='bold')

    # Plot 1: Column mean over time
    ax1 = axes[0, 0]
    ax1.plot(frame, df['column_mean'], label='Column Mean', linewidth=1)
    ax1.plot(frame, df['rolling_mean'], label='Rolling Mean', linewidth=1, alpha=0.7)
    ax1.set_xlabel('Frame')
    ax1.set_ylabel('Pixel Value')
    ax1.set_title('Column Mean vs Rolling Mean')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Plot 2: Deviation over time
    dev_mean = deviation.mean()
    dev_p95 = deviation.quantile(0.95)
    ax2 = axes[0, 1]
    ax2.plot(frame, deviation, linewidth=1, color='orange')
    ax2.axhline(y=dev_mean, color='r', linestyle='--', label=f'Mean: {dev_mean:.4f}')
    ax2.axhline(y=dev_p95, color='g', linestyle='--', label=f'95th: {dev_p95:.4f}')
    ax2.set_xlabel('Frame')
    ax2.set_ylabel('Absolute Deviation')
    ax2.set_title('Deviation from Rolling Mean')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Plot 3: Normalized deviation distribution
    norm_mean = norm_dev.mean()
    norm_median = norm_dev.median()
    ax3 = axes[1, 0]
    ax3.hist(norm_dev, bins=50, edgecolor='black', alpha=0.7)
    ax3.axvline(x=norm_mean, color='r', linestyle='--', label=f'Mean: {norm_mean:.6f}')
    ax3.axvline(x=norm_median, color='g', linestyle='--', label=f'Median: {norm_median:.6f}')
    ax3.set_xlabel('Normalized Deviation')
    ax3.set_ylabel('Frequency')
    ax3.set_title('Normalized Deviation Distribution')
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')

    # Plot 4: Cumulative distribution
    ax4 = axes[1, 1]
    sorted_norm_dev = np.sort(norm_dev)
    cumulative = np.arange(1, len(sorted_norm_dev) + 1) / len(sorted_norm_dev) * 100
    ax4.plot(sorted_norm_dev, cumulative, linewidth=2)

    # Mark percentiles
    for p in PERCENTILES:
        threshold = norm_dev.quantile(p / 100)
        ax4.axvline(x=threshold, color='red', linestyle=':', alpha=0.5)
        ax4.text(threshold, p, f'{p}th', rotation=90, va='bottom', ha='right', fontsize=8)

    ax4.set_xlabel('Normalized Deviation')
    ax4.set_ylabel('Cumulative Percentage (%)')
    ax4.set_title('Cumulative Distribution Function')
    ax4.grid(True, alpha=0.3)

    fig.tight_layout()

    # Save the plot to results/debug
    csv_name = Path(csv_file).stem
    output_file = output_dir / f"{csv_name}_analysis_{timestamp}.png"
    try:
        fig.savefig(output_file, dpi=150, bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated calls do not accumulate figures in memory.
        plt.close(fig)

    print(f"\n✓ Saved analysis plot to: {output_file}\n")
    return output_file


def main() -> None:
    """Command-line entry point: analyze the CSV named in argv[1], or output.csv."""
    csv_file = sys.argv[1] if len(sys.argv) > 1 else "output.csv"

    # is_file() also rejects directories, which read_csv cannot open.
    if not Path(csv_file).is_file():
        print(f"Error: File '{csv_file}' not found.")
        print("Usage: uv run analyze_sma.py [csv_file]")
        sys.exit(1)

    analyze_csv(csv_file)


if __name__ == "__main__":
    main()