Introduce script/histogram to produce before/after comparisons

Antonio Scandurra created

Change summary

script/histogram | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 64 insertions(+)

Detailed changes

script/histogram 🔗

@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+# This script is designed to parse log files for performance measurements and create histograms of these measurements.
+# It expects log files to contain lines of the form "measurement: <time><unit>", e.g. "frame: 1.5ms", where the unit is milliseconds (ms) or microseconds (µs).
+# Lines that do not consist of exactly one ': '-separated name/value pair are skipped; a value with any other unit raises a ValueError.
+# The script takes one or more file paths as command-line arguments, parses each log file, and then combines the data into a single DataFrame.
+# It then converts all time measurements into milliseconds, discards the original time and unit columns, and creates histograms for each unique measurement type.
+# The histograms display the distribution of times for each measurement, separated by log file, and normalized to show density rather than count.
+# To use this script, run it from the command line with the log file paths as arguments, like so:
+# script/histogram log1.txt log2.txt ...
+# The script will then parse the provided log files and display the histograms for each type of measurement found.
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import sys
+
+def parse_log_file(file_path):
+    """Parse "measurement: <number><unit>" lines of one log into a DataFrame.
+
+    Lines without exactly one ': ' separator are skipped. Returns columns
+    measurement, time (float), unit ('ms' or 'µs') and log_file (last path
+    component). Raises ValueError on an unknown unit or a non-numeric time.
+    """
+    data = {'measurement': [], 'time': [], 'unit': [], 'log_file': []}
+    log_name = file_path.split('/')[-1]
+    # 'µ' is non-ASCII: decode as UTF-8 rather than the locale default
+    # (assumes the logs are UTF-8 encoded - confirm against the log producer).
+    with open(file_path, 'r', encoding='utf-8') as file:
+        for line in file:
+            parts = line.strip().split(': ')
+            if len(parts) != 2:
+                continue
+
+            measurement, time_with_unit = parts
+            if not time_with_unit.endswith(('ms', 'µs')):
+                raise ValueError(f"Invalid time unit in line: {line.strip()}")
+            data['measurement'].append(measurement)
+            data['time'].append(float(time_with_unit[:-2]))
+            data['unit'].append(time_with_unit[-2:])
+            data['log_file'].append(log_name)
+    return pd.DataFrame(data)
+
+def create_histograms(df, measurement):
+    """Show a per-log-file density histogram of one measurement's times in ms."""
+    samples = df[df['measurement'] == measurement]
+    low, high = samples['time_ms'].quantile(0.01), samples['time_ms'].quantile(0.99)
+    plt.figure(figsize=(12, 6))
+    axes = sns.histplot(data=samples, x='time_ms', hue='log_file', element='step', stat='density', common_norm=False, palette='bright')
+    axes.set(title=f'Histogram of {measurement}', xlabel='Time (ms)', ylabel='Density')
+    axes.grid(True)
+    axes.set_xlim(low, high)  # x-axis spans the 1st to 99th percentile of the samples
+    plt.show()
+
+
+file_paths = sys.argv[1:]
+if not file_paths:  # pd.concat() raises an opaque ValueError on an empty list
+    sys.exit(f"usage: {sys.argv[0]} LOG_FILE [LOG_FILE ...]")
+combined_df = pd.concat([parse_log_file(path) for path in file_paths], ignore_index=True)
+# Convert to ms column-wise: any unit other than 'ms' is treated as µs. Also well-defined with zero rows.
+combined_df['time_ms'] = combined_df['time'].where(combined_df['unit'] == 'ms', combined_df['time'] / 1000)
+combined_df.drop(['time', 'unit'], axis=1, inplace=True)
+for measurement in combined_df['measurement'].unique():  # one figure per measurement
+    create_histograms(combined_df, measurement)