Files
MSCS532_Assignment7/examples/generate_plots.py
2025-11-24 17:11:51 -05:00

390 lines
16 KiB
Python

"""
Generate visualization plots for hash table performance analysis.
"""
import sys
import os
import matplotlib.pyplot as plt
import numpy as np
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from src.benchmark import (
benchmark_hash_functions,
benchmark_open_addressing_vs_chaining,
benchmark_load_factor_impact,
benchmark_load_factor_impact_probes,
generate_test_data
)
from src.hash_functions import (
division_hash,
multiplication_hash,
string_hash_simple,
string_hash_polynomial,
string_hash_djb2,
bad_hash_clustering
)
def plot_hash_function_comparison():
"""Compare different hash functions."""
print("Generating hash function comparison plot...")
keys = generate_test_data(1000)
table_size = 100
hash_funcs = {
'Division': division_hash,
'Multiplication': lambda k, s: multiplication_hash(k, s),
'Simple String': lambda k, s: string_hash_simple(str(k), s),
'Polynomial String': lambda k, s: string_hash_polynomial(str(k), s),
'DJB2': lambda k, s: string_hash_djb2(str(k), s),
'Bad Clustering': bad_hash_clustering,
}
results = benchmark_hash_functions(hash_funcs, keys, table_size)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Hash Function Performance Comparison', fontsize=16, fontweight='bold')
names = list(results.keys())
collision_rates = [results[n]['collision_rate'] * 100 for n in names]
variances = [results[n]['variance'] for n in names]
times = [results[n]['time'] * 1000 for n in names] # Convert to ms
max_chains = [results[n]['max_chain_length'] for n in names]
# Collision rate
axes[0, 0].bar(names, collision_rates, color='steelblue')
axes[0, 0].set_title('Collision Rate (%)', fontweight='bold')
axes[0, 0].set_ylabel('Collision Rate (%)')
axes[0, 0].tick_params(axis='x', rotation=45)
axes[0, 0].grid(axis='y', alpha=0.3)
# Variance (distribution quality)
axes[0, 1].bar(names, variances, color='coral')
axes[0, 1].set_title('Distribution Variance (Lower is Better)', fontweight='bold')
axes[0, 1].set_ylabel('Variance')
axes[0, 1].tick_params(axis='x', rotation=45)
axes[0, 1].grid(axis='y', alpha=0.3)
# Execution time
axes[1, 0].bar(names, times, color='mediumseagreen')
axes[1, 0].set_title('Execution Time', fontweight='bold')
axes[1, 0].set_ylabel('Time (ms)')
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 0].grid(axis='y', alpha=0.3)
# Max chain length
axes[1, 1].bar(names, max_chains, color='plum')
axes[1, 1].set_title('Maximum Chain Length', fontweight='bold')
axes[1, 1].set_ylabel('Max Chain Length')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].grid(axis='y', alpha=0.3)
plt.tight_layout()
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'hash_function_comparison.png')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Saved: {output_path}")
plt.close()
def plot_open_addressing_vs_chaining():
"""Compare open addressing vs separate chaining."""
print("Generating open addressing vs separate chaining comparison plot...")
sizes = [100, 500, 1000, 5000, 10000]
results = benchmark_open_addressing_vs_chaining(sizes)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Open Addressing vs Separate Chaining Performance', fontsize=16, fontweight='bold')
sizes_arr = np.array(sizes)
# Insert time
ax = axes[0, 0]
for probe_type in ['linear', 'quadratic', 'double']:
insert_times = [r['insert_time'] for r in results['open_addressing'][probe_type]]
ax.plot(sizes_arr, insert_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
insert_times_sc = [r['insert_time'] for r in results['separate_chaining']]
ax.plot(sizes_arr, insert_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
ax.set_xlabel('Number of Elements')
ax.set_ylabel('Insert Time (seconds)')
ax.set_title('Insert Performance', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
ax.set_xscale('log')
ax.set_yscale('log')
# Search time
ax = axes[0, 1]
for probe_type in ['linear', 'quadratic', 'double']:
search_times = [r['search_time'] for r in results['open_addressing'][probe_type]]
ax.plot(sizes_arr, search_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
search_times_sc = [r['search_time'] for r in results['separate_chaining']]
ax.plot(sizes_arr, search_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
ax.set_xlabel('Number of Elements')
ax.set_ylabel('Search Time (seconds)')
ax.set_title('Search Performance', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
ax.set_xscale('log')
ax.set_yscale('log')
# Delete time
ax = axes[1, 0]
for probe_type in ['linear', 'quadratic', 'double']:
delete_times = [r['delete_time'] for r in results['open_addressing'][probe_type]]
ax.plot(sizes_arr, delete_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
delete_times_sc = [r['delete_time'] for r in results['separate_chaining']]
ax.plot(sizes_arr, delete_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
ax.set_xlabel('Number of Elements')
ax.set_ylabel('Delete Time (seconds)')
ax.set_title('Delete Performance', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
ax.set_xscale('log')
ax.set_yscale('log')
# Load factors
ax = axes[1, 1]
for probe_type in ['linear', 'quadratic', 'double']:
load_factors = [r['load_factor'] for r in results['open_addressing'][probe_type]]
ax.plot(sizes_arr, load_factors, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
load_factors_sc = [r['load_factor'] for r in results['separate_chaining']]
ax.plot(sizes_arr, load_factors_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
ax.set_xlabel('Number of Elements')
ax.set_ylabel('Load Factor')
ax.set_title('Load Factor', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
ax.set_xscale('log')
plt.tight_layout()
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'open_addressing_vs_chaining.png')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Saved: {output_path}")
plt.close()
def plot_load_factor_impact():
"""Plot performance at different load factors with statistical smoothing."""
print("Generating load factor impact plot...")
results = benchmark_load_factor_impact(initial_size=100, max_elements=1000, probe_type='linear', num_runs=30)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Performance Impact of Load Factor', fontsize=16, fontweight='bold')
# Extract data
oa_data = results['open_addressing']
sc_data = results['separate_chaining']
# Sort by load factor to avoid zig-zag lines
oa_sorted = sorted(oa_data, key=lambda x: x['load_factor'])
sc_sorted = sorted(sc_data, key=lambda x: x['load_factor'])
oa_load_factors = [r['load_factor'] for r in oa_sorted]
oa_insert_times = [r['insert_time'] for r in oa_sorted]
oa_insert_stds = [r.get('insert_time_std', 0) for r in oa_sorted]
oa_search_times = [r['search_time'] for r in oa_sorted]
oa_search_stds = [r.get('search_time_std', 0) for r in oa_sorted]
sc_load_factors = [r['load_factor'] for r in sc_sorted]
sc_insert_times = [r['insert_time'] for r in sc_sorted]
sc_insert_stds = [r.get('insert_time_std', 0) for r in sc_sorted]
sc_search_times = [r['search_time'] for r in sc_sorted]
sc_search_stds = [r.get('search_time_std', 0) for r in sc_sorted]
sc_chain_lengths = [r['avg_chain_length'] for r in sc_sorted]
# Insert time vs load factor (per element) with error bars
ax = axes[0]
ax.errorbar(oa_load_factors, oa_insert_times, yerr=oa_insert_stds,
marker='o', label='Open Addressing (Linear)', linewidth=2,
capsize=3, capthick=1.5, alpha=0.8)
ax.errorbar(sc_load_factors, sc_insert_times, yerr=sc_insert_stds,
marker='s', label='Separate Chaining', linewidth=2, linestyle='--',
capsize=3, capthick=1.5, alpha=0.8)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Insert Time per Element (seconds)')
ax.set_title('Insert Time vs Load Factor', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
# Search time vs load factor (per element) with error bars
ax = axes[1]
ax.errorbar(oa_load_factors, oa_search_times, yerr=oa_search_stds,
marker='o', label='Open Addressing (Linear)', linewidth=2,
capsize=3, capthick=1.5, alpha=0.8)
ax.errorbar(sc_load_factors, sc_search_times, yerr=sc_search_stds,
marker='s', label='Separate Chaining', linewidth=2, linestyle='--',
capsize=3, capthick=1.5, alpha=0.8)
ax2 = ax.twinx()
# Chain length is smooth and accurate, so use line plot
ax2.plot(sc_load_factors, sc_chain_lengths, marker='^',
label='Avg Chain Length (SC)', color='green', linestyle=':', linewidth=2)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Search Time per Element (seconds)', color='blue')
ax2.set_ylabel('Average Chain Length', color='green')
ax.set_title('Search Time vs Load Factor', fontweight='bold')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.grid(alpha=0.3)
plt.tight_layout()
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'load_factor_impact.png')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Saved: {output_path}")
plt.close()
def plot_load_factor_impact_probes():
"""Plot probe counts and comparisons at different load factors.
Uses deterministic metrics (probe counts, comparisons) instead of timing
to produce smooth theoretical curves without measurement noise.
"""
print("Generating load factor impact plot (probe counts)...")
results = benchmark_load_factor_impact_probes(initial_size=100, max_elements=1000, probe_type='linear', num_runs=10)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Performance Impact of Load Factor (Probe Counts & Comparisons)', fontsize=16, fontweight='bold')
# Extract data
oa_data = results['open_addressing']
sc_data = results['separate_chaining']
# Sort by load factor
oa_sorted = sorted(oa_data, key=lambda x: x['load_factor'])
sc_sorted = sorted(sc_data, key=lambda x: x['load_factor'])
oa_load_factors = [r['load_factor'] for r in oa_sorted]
oa_insert_probes = [r['insert_probes_per_element'] for r in oa_sorted]
oa_search_probes = [r['search_probes_per_element'] for r in oa_sorted]
oa_insert_comparisons = [r['insert_comparisons_per_element'] for r in oa_sorted]
oa_search_comparisons = [r['search_comparisons_per_element'] for r in oa_sorted]
sc_load_factors = [r['load_factor'] for r in sc_sorted]
sc_insert_comparisons = [r['insert_comparisons_per_element'] for r in sc_sorted]
sc_search_comparisons = [r['search_comparisons_per_element'] for r in sc_sorted]
sc_chain_lengths = [r['avg_chain_length'] for r in sc_sorted]
# Insert probes per element (Open Addressing)
ax = axes[0, 0]
ax.plot(oa_load_factors, oa_insert_probes, marker='o', label='Open Addressing (Linear)',
linewidth=2, color='blue', markersize=6)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Probes per Element')
ax.set_title('Insert: Probes per Element (Open Addressing)', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
# Search probes per element (Open Addressing)
ax = axes[0, 1]
ax.plot(oa_load_factors, oa_search_probes, marker='o', label='Open Addressing (Linear)',
linewidth=2, color='blue', markersize=6)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Probes per Element')
ax.set_title('Search: Probes per Element (Open Addressing)', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
# Comparisons per element (both methods)
ax = axes[1, 0]
ax.plot(oa_load_factors, oa_insert_comparisons, marker='o', label='Open Addressing (Linear)',
linewidth=2, color='blue', markersize=6)
ax.plot(sc_load_factors, sc_insert_comparisons, marker='s', label='Separate Chaining',
linewidth=2, linestyle='--', color='orange', markersize=6)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Comparisons per Element')
ax.set_title('Insert: Comparisons per Element', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
# Search comparisons per element and chain length
ax = axes[1, 1]
ax.plot(oa_load_factors, oa_search_comparisons, marker='o', label='Open Addressing (Linear)',
linewidth=2, color='blue', markersize=6)
ax.plot(sc_load_factors, sc_search_comparisons, marker='s', label='Separate Chaining',
linewidth=2, linestyle='--', color='orange', markersize=6)
ax2 = ax.twinx()
ax2.plot(sc_load_factors, sc_chain_lengths, marker='^', label='Avg Chain Length (SC)',
color='green', linestyle=':', linewidth=2, markersize=6)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Comparisons per Element', color='blue')
ax2.set_ylabel('Average Chain Length', color='green')
ax.set_title('Search: Comparisons per Element', fontweight='bold')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.grid(alpha=0.3)
plt.tight_layout()
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'load_factor_impact_probes.png')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Saved: {output_path}")
plt.close()
def plot_collision_analysis():
"""Plot collision analysis for different hash functions."""
print("Generating collision analysis plot...")
keys = generate_test_data(500)
table_size = 100
hash_funcs = {
'Division': division_hash,
'Multiplication': lambda k, s: multiplication_hash(k, s),
'Simple String': lambda k, s: string_hash_simple(str(k), s),
'Polynomial': lambda k, s: string_hash_polynomial(str(k), s),
'Bad Clustering': bad_hash_clustering,
}
results = benchmark_hash_functions(hash_funcs, keys, table_size)
fig, ax = plt.subplots(figsize=(12, 6))
names = list(results.keys())
collision_counts = [results[n]['collisions'] for n in names]
colors = ['steelblue' if 'Bad' not in n else 'coral' for n in names]
bars = ax.bar(names, collision_counts, color=colors)
ax.set_xlabel('Hash Function', fontweight='bold')
ax.set_ylabel('Number of Collisions', fontweight='bold')
ax.set_title('Collision Analysis: Good vs Bad Hash Functions', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
ax.tick_params(axis='x', rotation=45)
# Add value labels on bars
for bar in bars:
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width()/2., height,
f'{int(height)}',
ha='center', va='bottom', fontweight='bold')
plt.tight_layout()
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'collision_analysis.png')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Saved: {output_path}")
plt.close()
if __name__ == "__main__":
# Ensure docs directory exists
os.makedirs(os.path.join(os.path.dirname(__file__), '..', 'docs'), exist_ok=True)
print("Generating visualization plots...")
print("This may take a few minutes...\n")
plot_hash_function_comparison()
plot_open_addressing_vs_chaining()
plot_load_factor_impact()
plot_load_factor_impact_probes()
plot_collision_analysis()
print("\nAll plots generated successfully!")