390 lines
16 KiB
Python
390 lines
16 KiB
Python
"""
|
|
Generate visualization plots for hash table performance analysis.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
|
|
from src.benchmark import (
|
|
benchmark_hash_functions,
|
|
benchmark_open_addressing_vs_chaining,
|
|
benchmark_load_factor_impact,
|
|
benchmark_load_factor_impact_probes,
|
|
generate_test_data
|
|
)
|
|
from src.hash_functions import (
|
|
division_hash,
|
|
multiplication_hash,
|
|
string_hash_simple,
|
|
string_hash_polynomial,
|
|
string_hash_djb2,
|
|
bad_hash_clustering
|
|
)
|
|
|
|
|
|
def plot_hash_function_comparison():
|
|
"""Compare different hash functions."""
|
|
print("Generating hash function comparison plot...")
|
|
|
|
keys = generate_test_data(1000)
|
|
table_size = 100
|
|
|
|
hash_funcs = {
|
|
'Division': division_hash,
|
|
'Multiplication': lambda k, s: multiplication_hash(k, s),
|
|
'Simple String': lambda k, s: string_hash_simple(str(k), s),
|
|
'Polynomial String': lambda k, s: string_hash_polynomial(str(k), s),
|
|
'DJB2': lambda k, s: string_hash_djb2(str(k), s),
|
|
'Bad Clustering': bad_hash_clustering,
|
|
}
|
|
|
|
results = benchmark_hash_functions(hash_funcs, keys, table_size)
|
|
|
|
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
|
fig.suptitle('Hash Function Performance Comparison', fontsize=16, fontweight='bold')
|
|
|
|
names = list(results.keys())
|
|
collision_rates = [results[n]['collision_rate'] * 100 for n in names]
|
|
variances = [results[n]['variance'] for n in names]
|
|
times = [results[n]['time'] * 1000 for n in names] # Convert to ms
|
|
max_chains = [results[n]['max_chain_length'] for n in names]
|
|
|
|
# Collision rate
|
|
axes[0, 0].bar(names, collision_rates, color='steelblue')
|
|
axes[0, 0].set_title('Collision Rate (%)', fontweight='bold')
|
|
axes[0, 0].set_ylabel('Collision Rate (%)')
|
|
axes[0, 0].tick_params(axis='x', rotation=45)
|
|
axes[0, 0].grid(axis='y', alpha=0.3)
|
|
|
|
# Variance (distribution quality)
|
|
axes[0, 1].bar(names, variances, color='coral')
|
|
axes[0, 1].set_title('Distribution Variance (Lower is Better)', fontweight='bold')
|
|
axes[0, 1].set_ylabel('Variance')
|
|
axes[0, 1].tick_params(axis='x', rotation=45)
|
|
axes[0, 1].grid(axis='y', alpha=0.3)
|
|
|
|
# Execution time
|
|
axes[1, 0].bar(names, times, color='mediumseagreen')
|
|
axes[1, 0].set_title('Execution Time', fontweight='bold')
|
|
axes[1, 0].set_ylabel('Time (ms)')
|
|
axes[1, 0].tick_params(axis='x', rotation=45)
|
|
axes[1, 0].grid(axis='y', alpha=0.3)
|
|
|
|
# Max chain length
|
|
axes[1, 1].bar(names, max_chains, color='plum')
|
|
axes[1, 1].set_title('Maximum Chain Length', fontweight='bold')
|
|
axes[1, 1].set_ylabel('Max Chain Length')
|
|
axes[1, 1].tick_params(axis='x', rotation=45)
|
|
axes[1, 1].grid(axis='y', alpha=0.3)
|
|
|
|
plt.tight_layout()
|
|
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'hash_function_comparison.png')
|
|
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
|
print(f"Saved: {output_path}")
|
|
plt.close()
|
|
|
|
|
|
def plot_open_addressing_vs_chaining():
|
|
"""Compare open addressing vs separate chaining."""
|
|
print("Generating open addressing vs separate chaining comparison plot...")
|
|
|
|
sizes = [100, 500, 1000, 5000, 10000]
|
|
results = benchmark_open_addressing_vs_chaining(sizes)
|
|
|
|
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
|
fig.suptitle('Open Addressing vs Separate Chaining Performance', fontsize=16, fontweight='bold')
|
|
|
|
sizes_arr = np.array(sizes)
|
|
|
|
# Insert time
|
|
ax = axes[0, 0]
|
|
for probe_type in ['linear', 'quadratic', 'double']:
|
|
insert_times = [r['insert_time'] for r in results['open_addressing'][probe_type]]
|
|
ax.plot(sizes_arr, insert_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
|
|
|
|
insert_times_sc = [r['insert_time'] for r in results['separate_chaining']]
|
|
ax.plot(sizes_arr, insert_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
|
|
ax.set_xlabel('Number of Elements')
|
|
ax.set_ylabel('Insert Time (seconds)')
|
|
ax.set_title('Insert Performance', fontweight='bold')
|
|
ax.legend()
|
|
ax.grid(alpha=0.3)
|
|
ax.set_xscale('log')
|
|
ax.set_yscale('log')
|
|
|
|
# Search time
|
|
ax = axes[0, 1]
|
|
for probe_type in ['linear', 'quadratic', 'double']:
|
|
search_times = [r['search_time'] for r in results['open_addressing'][probe_type]]
|
|
ax.plot(sizes_arr, search_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
|
|
|
|
search_times_sc = [r['search_time'] for r in results['separate_chaining']]
|
|
ax.plot(sizes_arr, search_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
|
|
ax.set_xlabel('Number of Elements')
|
|
ax.set_ylabel('Search Time (seconds)')
|
|
ax.set_title('Search Performance', fontweight='bold')
|
|
ax.legend()
|
|
ax.grid(alpha=0.3)
|
|
ax.set_xscale('log')
|
|
ax.set_yscale('log')
|
|
|
|
# Delete time
|
|
ax = axes[1, 0]
|
|
for probe_type in ['linear', 'quadratic', 'double']:
|
|
delete_times = [r['delete_time'] for r in results['open_addressing'][probe_type]]
|
|
ax.plot(sizes_arr, delete_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
|
|
|
|
delete_times_sc = [r['delete_time'] for r in results['separate_chaining']]
|
|
ax.plot(sizes_arr, delete_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
|
|
ax.set_xlabel('Number of Elements')
|
|
ax.set_ylabel('Delete Time (seconds)')
|
|
ax.set_title('Delete Performance', fontweight='bold')
|
|
ax.legend()
|
|
ax.grid(alpha=0.3)
|
|
ax.set_xscale('log')
|
|
ax.set_yscale('log')
|
|
|
|
# Load factors
|
|
ax = axes[1, 1]
|
|
for probe_type in ['linear', 'quadratic', 'double']:
|
|
load_factors = [r['load_factor'] for r in results['open_addressing'][probe_type]]
|
|
ax.plot(sizes_arr, load_factors, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
|
|
|
|
load_factors_sc = [r['load_factor'] for r in results['separate_chaining']]
|
|
ax.plot(sizes_arr, load_factors_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
|
|
ax.set_xlabel('Number of Elements')
|
|
ax.set_ylabel('Load Factor')
|
|
ax.set_title('Load Factor', fontweight='bold')
|
|
ax.legend()
|
|
ax.grid(alpha=0.3)
|
|
ax.set_xscale('log')
|
|
|
|
plt.tight_layout()
|
|
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'open_addressing_vs_chaining.png')
|
|
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
|
print(f"Saved: {output_path}")
|
|
plt.close()
|
|
|
|
|
|
def plot_load_factor_impact():
|
|
"""Plot performance at different load factors with statistical smoothing."""
|
|
print("Generating load factor impact plot...")
|
|
|
|
results = benchmark_load_factor_impact(initial_size=100, max_elements=1000, probe_type='linear', num_runs=30)
|
|
|
|
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
|
|
fig.suptitle('Performance Impact of Load Factor', fontsize=16, fontweight='bold')
|
|
|
|
# Extract data
|
|
oa_data = results['open_addressing']
|
|
sc_data = results['separate_chaining']
|
|
|
|
# Sort by load factor to avoid zig-zag lines
|
|
oa_sorted = sorted(oa_data, key=lambda x: x['load_factor'])
|
|
sc_sorted = sorted(sc_data, key=lambda x: x['load_factor'])
|
|
|
|
oa_load_factors = [r['load_factor'] for r in oa_sorted]
|
|
oa_insert_times = [r['insert_time'] for r in oa_sorted]
|
|
oa_insert_stds = [r.get('insert_time_std', 0) for r in oa_sorted]
|
|
oa_search_times = [r['search_time'] for r in oa_sorted]
|
|
oa_search_stds = [r.get('search_time_std', 0) for r in oa_sorted]
|
|
|
|
sc_load_factors = [r['load_factor'] for r in sc_sorted]
|
|
sc_insert_times = [r['insert_time'] for r in sc_sorted]
|
|
sc_insert_stds = [r.get('insert_time_std', 0) for r in sc_sorted]
|
|
sc_search_times = [r['search_time'] for r in sc_sorted]
|
|
sc_search_stds = [r.get('search_time_std', 0) for r in sc_sorted]
|
|
sc_chain_lengths = [r['avg_chain_length'] for r in sc_sorted]
|
|
|
|
# Insert time vs load factor (per element) with error bars
|
|
ax = axes[0]
|
|
ax.errorbar(oa_load_factors, oa_insert_times, yerr=oa_insert_stds,
|
|
marker='o', label='Open Addressing (Linear)', linewidth=2,
|
|
capsize=3, capthick=1.5, alpha=0.8)
|
|
ax.errorbar(sc_load_factors, sc_insert_times, yerr=sc_insert_stds,
|
|
marker='s', label='Separate Chaining', linewidth=2, linestyle='--',
|
|
capsize=3, capthick=1.5, alpha=0.8)
|
|
ax.set_xlabel('Load Factor')
|
|
ax.set_ylabel('Insert Time per Element (seconds)')
|
|
ax.set_title('Insert Time vs Load Factor', fontweight='bold')
|
|
ax.legend()
|
|
ax.grid(alpha=0.3)
|
|
|
|
# Search time vs load factor (per element) with error bars
|
|
ax = axes[1]
|
|
ax.errorbar(oa_load_factors, oa_search_times, yerr=oa_search_stds,
|
|
marker='o', label='Open Addressing (Linear)', linewidth=2,
|
|
capsize=3, capthick=1.5, alpha=0.8)
|
|
ax.errorbar(sc_load_factors, sc_search_times, yerr=sc_search_stds,
|
|
marker='s', label='Separate Chaining', linewidth=2, linestyle='--',
|
|
capsize=3, capthick=1.5, alpha=0.8)
|
|
ax2 = ax.twinx()
|
|
# Chain length is smooth and accurate, so use line plot
|
|
ax2.plot(sc_load_factors, sc_chain_lengths, marker='^',
|
|
label='Avg Chain Length (SC)', color='green', linestyle=':', linewidth=2)
|
|
ax.set_xlabel('Load Factor')
|
|
ax.set_ylabel('Search Time per Element (seconds)', color='blue')
|
|
ax2.set_ylabel('Average Chain Length', color='green')
|
|
ax.set_title('Search Time vs Load Factor', fontweight='bold')
|
|
ax.legend(loc='upper left')
|
|
ax2.legend(loc='upper right')
|
|
ax.grid(alpha=0.3)
|
|
|
|
plt.tight_layout()
|
|
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'load_factor_impact.png')
|
|
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
|
print(f"Saved: {output_path}")
|
|
plt.close()
|
|
|
|
|
|
def plot_load_factor_impact_probes():
|
|
"""Plot probe counts and comparisons at different load factors.
|
|
|
|
Uses deterministic metrics (probe counts, comparisons) instead of timing
|
|
to produce smooth theoretical curves without measurement noise.
|
|
"""
|
|
print("Generating load factor impact plot (probe counts)...")
|
|
|
|
results = benchmark_load_factor_impact_probes(initial_size=100, max_elements=1000, probe_type='linear', num_runs=10)
|
|
|
|
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
|
fig.suptitle('Performance Impact of Load Factor (Probe Counts & Comparisons)', fontsize=16, fontweight='bold')
|
|
|
|
# Extract data
|
|
oa_data = results['open_addressing']
|
|
sc_data = results['separate_chaining']
|
|
|
|
# Sort by load factor
|
|
oa_sorted = sorted(oa_data, key=lambda x: x['load_factor'])
|
|
sc_sorted = sorted(sc_data, key=lambda x: x['load_factor'])
|
|
|
|
oa_load_factors = [r['load_factor'] for r in oa_sorted]
|
|
oa_insert_probes = [r['insert_probes_per_element'] for r in oa_sorted]
|
|
oa_search_probes = [r['search_probes_per_element'] for r in oa_sorted]
|
|
oa_insert_comparisons = [r['insert_comparisons_per_element'] for r in oa_sorted]
|
|
oa_search_comparisons = [r['search_comparisons_per_element'] for r in oa_sorted]
|
|
|
|
sc_load_factors = [r['load_factor'] for r in sc_sorted]
|
|
sc_insert_comparisons = [r['insert_comparisons_per_element'] for r in sc_sorted]
|
|
sc_search_comparisons = [r['search_comparisons_per_element'] for r in sc_sorted]
|
|
sc_chain_lengths = [r['avg_chain_length'] for r in sc_sorted]
|
|
|
|
# Insert probes per element (Open Addressing)
|
|
ax = axes[0, 0]
|
|
ax.plot(oa_load_factors, oa_insert_probes, marker='o', label='Open Addressing (Linear)',
|
|
linewidth=2, color='blue', markersize=6)
|
|
ax.set_xlabel('Load Factor')
|
|
ax.set_ylabel('Probes per Element')
|
|
ax.set_title('Insert: Probes per Element (Open Addressing)', fontweight='bold')
|
|
ax.legend()
|
|
ax.grid(alpha=0.3)
|
|
|
|
# Search probes per element (Open Addressing)
|
|
ax = axes[0, 1]
|
|
ax.plot(oa_load_factors, oa_search_probes, marker='o', label='Open Addressing (Linear)',
|
|
linewidth=2, color='blue', markersize=6)
|
|
ax.set_xlabel('Load Factor')
|
|
ax.set_ylabel('Probes per Element')
|
|
ax.set_title('Search: Probes per Element (Open Addressing)', fontweight='bold')
|
|
ax.legend()
|
|
ax.grid(alpha=0.3)
|
|
|
|
# Comparisons per element (both methods)
|
|
ax = axes[1, 0]
|
|
ax.plot(oa_load_factors, oa_insert_comparisons, marker='o', label='Open Addressing (Linear)',
|
|
linewidth=2, color='blue', markersize=6)
|
|
ax.plot(sc_load_factors, sc_insert_comparisons, marker='s', label='Separate Chaining',
|
|
linewidth=2, linestyle='--', color='orange', markersize=6)
|
|
ax.set_xlabel('Load Factor')
|
|
ax.set_ylabel('Comparisons per Element')
|
|
ax.set_title('Insert: Comparisons per Element', fontweight='bold')
|
|
ax.legend()
|
|
ax.grid(alpha=0.3)
|
|
|
|
# Search comparisons per element and chain length
|
|
ax = axes[1, 1]
|
|
ax.plot(oa_load_factors, oa_search_comparisons, marker='o', label='Open Addressing (Linear)',
|
|
linewidth=2, color='blue', markersize=6)
|
|
ax.plot(sc_load_factors, sc_search_comparisons, marker='s', label='Separate Chaining',
|
|
linewidth=2, linestyle='--', color='orange', markersize=6)
|
|
ax2 = ax.twinx()
|
|
ax2.plot(sc_load_factors, sc_chain_lengths, marker='^', label='Avg Chain Length (SC)',
|
|
color='green', linestyle=':', linewidth=2, markersize=6)
|
|
ax.set_xlabel('Load Factor')
|
|
ax.set_ylabel('Comparisons per Element', color='blue')
|
|
ax2.set_ylabel('Average Chain Length', color='green')
|
|
ax.set_title('Search: Comparisons per Element', fontweight='bold')
|
|
ax.legend(loc='upper left')
|
|
ax2.legend(loc='upper right')
|
|
ax.grid(alpha=0.3)
|
|
|
|
plt.tight_layout()
|
|
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'load_factor_impact_probes.png')
|
|
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
|
print(f"Saved: {output_path}")
|
|
plt.close()
|
|
|
|
|
|
def plot_collision_analysis():
|
|
"""Plot collision analysis for different hash functions."""
|
|
print("Generating collision analysis plot...")
|
|
|
|
keys = generate_test_data(500)
|
|
table_size = 100
|
|
|
|
hash_funcs = {
|
|
'Division': division_hash,
|
|
'Multiplication': lambda k, s: multiplication_hash(k, s),
|
|
'Simple String': lambda k, s: string_hash_simple(str(k), s),
|
|
'Polynomial': lambda k, s: string_hash_polynomial(str(k), s),
|
|
'Bad Clustering': bad_hash_clustering,
|
|
}
|
|
|
|
results = benchmark_hash_functions(hash_funcs, keys, table_size)
|
|
|
|
fig, ax = plt.subplots(figsize=(12, 6))
|
|
|
|
names = list(results.keys())
|
|
collision_counts = [results[n]['collisions'] for n in names]
|
|
colors = ['steelblue' if 'Bad' not in n else 'coral' for n in names]
|
|
|
|
bars = ax.bar(names, collision_counts, color=colors)
|
|
ax.set_xlabel('Hash Function', fontweight='bold')
|
|
ax.set_ylabel('Number of Collisions', fontweight='bold')
|
|
ax.set_title('Collision Analysis: Good vs Bad Hash Functions', fontsize=14, fontweight='bold')
|
|
ax.grid(axis='y', alpha=0.3)
|
|
ax.tick_params(axis='x', rotation=45)
|
|
|
|
# Add value labels on bars
|
|
for bar in bars:
|
|
height = bar.get_height()
|
|
ax.text(bar.get_x() + bar.get_width()/2., height,
|
|
f'{int(height)}',
|
|
ha='center', va='bottom', fontweight='bold')
|
|
|
|
plt.tight_layout()
|
|
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'collision_analysis.png')
|
|
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
|
print(f"Saved: {output_path}")
|
|
plt.close()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
# Ensure docs directory exists
|
|
os.makedirs(os.path.join(os.path.dirname(__file__), '..', 'docs'), exist_ok=True)
|
|
|
|
print("Generating visualization plots...")
|
|
print("This may take a few minutes...\n")
|
|
|
|
plot_hash_function_comparison()
|
|
plot_open_addressing_vs_chaining()
|
|
plot_load_factor_impact()
|
|
plot_load_factor_impact_probes()
|
|
plot_collision_analysis()
|
|
|
|
print("\nAll plots generated successfully!")
|
|
|