gst-plugin-linescan/analyze_sma.py
yair 44083222ee refactor: Output analysis results to results/debug directory
- Update analyze_sma.py to save files to results/debug/
- Add timestamp to archived CSV and plot files
- Update .gitignore to exclude results/ directory
- Update ROLLINGSUM_GUIDE.md with new output locations
2025-11-14 14:34:58 +02:00

200 lines
7.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# /// script
# dependencies = [
# "pandas>=2.0.0",
# "matplotlib>=3.7.0",
# "numpy>=1.24.0",
# ]
# ///
"""
Rolling Sum Analysis Tool
Analyzes CSV output from the GStreamer rollingsum plugin
Usage: uv run analyze_sma.py [csv_file]
"""
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from datetime import datetime
import shutil
def analyze_csv(csv_file: str = "output.csv"):
    """Analyze the rolling sum CSV data and generate insights.

    Reads the CSV produced by the GStreamer rollingsum plugin, archives a
    timestamped copy under results/debug/, prints summary statistics and
    drop-threshold recommendations, and renders analysis plots via
    create_plots().

    Args:
        csv_file: Path to the plugin's CSV output. Expected columns
            (inferred from usage below): frame, column_mean, rolling_mean,
            deviation, normalized_deviation, dropped.

    Exits with status 1 if the file is missing, empty, or has no data rows.
    """
    # Create output directory for the archived CSV and generated plot.
    output_dir = Path("results/debug")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read the CSV; fail fast with a clear message on common failure modes.
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: CSV file '{csv_file}' not found.")
        sys.exit(1)
    except pd.errors.EmptyDataError:
        # A zero-byte/blank file would otherwise crash with a traceback.
        print(f"Error: CSV file '{csv_file}' is empty.")
        sys.exit(1)
    if df.empty:
        # Guard: every percentage below divides by len(df).
        print(f"Error: CSV file '{csv_file}' contains no data rows.")
        sys.exit(1)

    # Copy input CSV to results directory with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_name = Path(csv_file).stem
    archived_csv = output_dir / f"{csv_name}_{timestamp}.csv"
    shutil.copy(csv_file, archived_csv)

    print("=" * 80)
    print(f"ROLLING SUM ANALYSIS - {csv_file}")
    print("=" * 80)
    print()

    # Basic statistics
    print("DATASET OVERVIEW:")
    print(f" Total frames: {len(df)}")
    print(f" Frames dropped: {df['dropped'].sum()}")
    print(f" Frames kept: {(df['dropped'] == 0).sum()}")
    print(f" Drop rate: {df['dropped'].mean() * 100:.2f}%")
    print()

    # Column mean statistics
    print("COLUMN MEAN STATISTICS:")
    print(f" Min: {df['column_mean'].min():.6f}")
    print(f" Max: {df['column_mean'].max():.6f}")
    print(f" Range: {df['column_mean'].max() - df['column_mean'].min():.6f}")
    print(f" Mean: {df['column_mean'].mean():.6f}")
    print(f" Std Dev: {df['column_mean'].std():.6f}")
    print()

    # Deviation statistics
    print("DEVIATION STATISTICS:")
    print(f" Min deviation: {df['deviation'].min():.6f}")
    print(f" Max deviation: {df['deviation'].max():.6f}")
    print(f" Mean deviation: {df['deviation'].mean():.6f}")
    print(f" Std dev of deviations: {df['deviation'].std():.6f}")
    print()

    # Normalized deviation statistics
    print("NORMALIZED DEVIATION STATISTICS:")
    print(f" Min: {df['normalized_deviation'].min():.8f}")
    print(f" Max: {df['normalized_deviation'].max():.8f}")
    print(f" Mean: {df['normalized_deviation'].mean():.8f}")
    print(f" Median: {df['normalized_deviation'].median():.8f}")
    print(f" 95th percentile: {df['normalized_deviation'].quantile(0.95):.8f}")
    print(f" 99th percentile: {df['normalized_deviation'].quantile(0.99):.8f}")
    print()

    # Threshold recommendations: show how many frames each percentile-based
    # threshold would drop, so the user can pick a plugin setting.
    print("THRESHOLD RECOMMENDATIONS:")
    print(" (Based on normalized deviation percentiles)")
    print()
    percentiles = [50, 75, 90, 95, 99]
    for p in percentiles:
        threshold = df['normalized_deviation'].quantile(p / 100)
        frames_dropped = (df['normalized_deviation'] > threshold).sum()
        drop_rate = (frames_dropped / len(df)) * 100
        print(f" {p}th percentile: threshold={threshold:.8f}")
        print(f" → Would drop {frames_dropped} frames ({drop_rate:.1f}%)")
    print()

    # Suggest optimal thresholds based on standard deviations
    mean_norm_dev = df['normalized_deviation'].mean()
    std_norm_dev = df['normalized_deviation'].std()
    print("STANDARD DEVIATION-BASED THRESHOLDS:")
    for n in [1, 2, 3]:
        threshold = mean_norm_dev + (n * std_norm_dev)
        frames_dropped = (df['normalized_deviation'] > threshold).sum()
        drop_rate = (frames_dropped / len(df)) * 100
        print(f" Mean + {n}σ: threshold={threshold:.8f}")
        print(f" → Would drop {frames_dropped} frames ({drop_rate:.1f}%)")
    print()

    # Create visualizations
    plot_file = create_plots(df, csv_file, output_dir, timestamp)

    print("=" * 80)
    print("OUTPUT FILES:")
    print(f" CSV Archive: {archived_csv}")
    print(f" Analysis Plot: {plot_file}")
    print("=" * 80)
def create_plots(df: pd.DataFrame, csv_file: str, output_dir: Path, timestamp: str) -> Path:
    """Create analysis plots and return the output file path.

    Renders a 2x2 grid: column mean vs rolling mean, deviation over time,
    normalized-deviation histogram, and the cumulative distribution with
    percentile markers. Saves the figure as a PNG under *output_dir*.

    Args:
        df: Parsed rollingsum CSV; must contain frame, column_mean,
            rolling_mean, deviation and normalized_deviation columns.
        csv_file: Original CSV path (used for the title and file name).
        output_dir: Directory the PNG is written into (must exist).
        timestamp: Suffix distinguishing this run's output files.

    Returns:
        Path of the saved PNG.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f'Rolling Sum Analysis - {csv_file}', fontsize=16, fontweight='bold')

    # Plot 1: Column mean over time
    ax1 = axes[0, 0]
    ax1.plot(df['frame'], df['column_mean'], label='Column Mean', linewidth=1)
    ax1.plot(df['frame'], df['rolling_mean'], label='Rolling Mean', linewidth=1, alpha=0.7)
    ax1.set_xlabel('Frame')
    ax1.set_ylabel('Pixel Value')
    ax1.set_title('Column Mean vs Rolling Mean')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Plot 2: Deviation over time, with mean and 95th-percentile reference lines
    ax2 = axes[0, 1]
    ax2.plot(df['frame'], df['deviation'], linewidth=1, color='orange')
    ax2.axhline(y=df['deviation'].mean(), color='r', linestyle='--',
                label=f'Mean: {df["deviation"].mean():.4f}')
    ax2.axhline(y=df['deviation'].quantile(0.95), color='g', linestyle='--',
                label=f'95th: {df["deviation"].quantile(0.95):.4f}')
    ax2.set_xlabel('Frame')
    ax2.set_ylabel('Absolute Deviation')
    ax2.set_title('Deviation from Rolling Mean')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Plot 3: Normalized deviation distribution
    ax3 = axes[1, 0]
    ax3.hist(df['normalized_deviation'], bins=50, edgecolor='black', alpha=0.7)
    ax3.axvline(x=df['normalized_deviation'].mean(), color='r', linestyle='--',
                label=f'Mean: {df["normalized_deviation"].mean():.6f}')
    ax3.axvline(x=df['normalized_deviation'].median(), color='g', linestyle='--',
                label=f'Median: {df["normalized_deviation"].median():.6f}')
    ax3.set_xlabel('Normalized Deviation')
    ax3.set_ylabel('Frequency')
    ax3.set_title('Normalized Deviation Distribution')
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')

    # Plot 4: Cumulative distribution (empirical CDF in percent)
    ax4 = axes[1, 1]
    sorted_norm_dev = np.sort(df['normalized_deviation'])
    cumulative = np.arange(1, len(sorted_norm_dev) + 1) / len(sorted_norm_dev) * 100
    ax4.plot(sorted_norm_dev, cumulative, linewidth=2)
    # Mark percentiles
    for p in [50, 75, 90, 95, 99]:
        threshold = df['normalized_deviation'].quantile(p / 100)
        ax4.axvline(x=threshold, color='red', linestyle=':', alpha=0.5)
        ax4.text(threshold, p, f'{p}th', rotation=90, va='bottom', ha='right', fontsize=8)
    ax4.set_xlabel('Normalized Deviation')
    ax4.set_ylabel('Cumulative Percentage (%)')
    ax4.set_title('Cumulative Distribution Function')
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot to results/debug
    csv_name = Path(csv_file).stem
    output_file = output_dir / f"{csv_name}_analysis_{timestamp}.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    # Release the figure: without this, repeated calls accumulate open
    # figures in pyplot's global state (matplotlib warns after 20).
    plt.close(fig)
    print(f"\n✓ Saved analysis plot to: {output_file}\n")
    return output_file
if __name__ == "__main__":
    # Optional positional argument: CSV path (defaults to output.csv).
    csv_file = sys.argv[1] if len(sys.argv) > 1 else "output.csv"
    # Pre-check existence for a friendly usage message; analyze_csv also
    # guards against this, but here we can show usage before exiting.
    if not Path(csv_file).exists():
        print(f"Error: File '{csv_file}' not found.")
        # Plain string: the original f-prefix had no placeholders (ruff F541).
        print("Usage: uv run analyze_sma.py [csv_file]")
        sys.exit(1)
    analyze_csv(csv_file)