Adding hash algorithms
This commit is contained in:
389
examples/generate_plots.py
Normal file
389
examples/generate_plots.py
Normal file
@@ -0,0 +1,389 @@
|
||||
"""
|
||||
Generate visualization plots for hash table performance analysis.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||||
|
||||
from src.benchmark import (
|
||||
benchmark_hash_functions,
|
||||
benchmark_open_addressing_vs_chaining,
|
||||
benchmark_load_factor_impact,
|
||||
benchmark_load_factor_impact_probes,
|
||||
generate_test_data
|
||||
)
|
||||
from src.hash_functions import (
|
||||
division_hash,
|
||||
multiplication_hash,
|
||||
string_hash_simple,
|
||||
string_hash_polynomial,
|
||||
string_hash_djb2,
|
||||
bad_hash_clustering
|
||||
)
|
||||
|
||||
|
||||
def plot_hash_function_comparison():
|
||||
"""Compare different hash functions."""
|
||||
print("Generating hash function comparison plot...")
|
||||
|
||||
keys = generate_test_data(1000)
|
||||
table_size = 100
|
||||
|
||||
hash_funcs = {
|
||||
'Division': division_hash,
|
||||
'Multiplication': lambda k, s: multiplication_hash(k, s),
|
||||
'Simple String': lambda k, s: string_hash_simple(str(k), s),
|
||||
'Polynomial String': lambda k, s: string_hash_polynomial(str(k), s),
|
||||
'DJB2': lambda k, s: string_hash_djb2(str(k), s),
|
||||
'Bad Clustering': bad_hash_clustering,
|
||||
}
|
||||
|
||||
results = benchmark_hash_functions(hash_funcs, keys, table_size)
|
||||
|
||||
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
||||
fig.suptitle('Hash Function Performance Comparison', fontsize=16, fontweight='bold')
|
||||
|
||||
names = list(results.keys())
|
||||
collision_rates = [results[n]['collision_rate'] * 100 for n in names]
|
||||
variances = [results[n]['variance'] for n in names]
|
||||
times = [results[n]['time'] * 1000 for n in names] # Convert to ms
|
||||
max_chains = [results[n]['max_chain_length'] for n in names]
|
||||
|
||||
# Collision rate
|
||||
axes[0, 0].bar(names, collision_rates, color='steelblue')
|
||||
axes[0, 0].set_title('Collision Rate (%)', fontweight='bold')
|
||||
axes[0, 0].set_ylabel('Collision Rate (%)')
|
||||
axes[0, 0].tick_params(axis='x', rotation=45)
|
||||
axes[0, 0].grid(axis='y', alpha=0.3)
|
||||
|
||||
# Variance (distribution quality)
|
||||
axes[0, 1].bar(names, variances, color='coral')
|
||||
axes[0, 1].set_title('Distribution Variance (Lower is Better)', fontweight='bold')
|
||||
axes[0, 1].set_ylabel('Variance')
|
||||
axes[0, 1].tick_params(axis='x', rotation=45)
|
||||
axes[0, 1].grid(axis='y', alpha=0.3)
|
||||
|
||||
# Execution time
|
||||
axes[1, 0].bar(names, times, color='mediumseagreen')
|
||||
axes[1, 0].set_title('Execution Time', fontweight='bold')
|
||||
axes[1, 0].set_ylabel('Time (ms)')
|
||||
axes[1, 0].tick_params(axis='x', rotation=45)
|
||||
axes[1, 0].grid(axis='y', alpha=0.3)
|
||||
|
||||
# Max chain length
|
||||
axes[1, 1].bar(names, max_chains, color='plum')
|
||||
axes[1, 1].set_title('Maximum Chain Length', fontweight='bold')
|
||||
axes[1, 1].set_ylabel('Max Chain Length')
|
||||
axes[1, 1].tick_params(axis='x', rotation=45)
|
||||
axes[1, 1].grid(axis='y', alpha=0.3)
|
||||
|
||||
plt.tight_layout()
|
||||
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'hash_function_comparison.png')
|
||||
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
||||
print(f"Saved: {output_path}")
|
||||
plt.close()
|
||||
|
||||
|
||||
def plot_open_addressing_vs_chaining():
|
||||
"""Compare open addressing vs separate chaining."""
|
||||
print("Generating open addressing vs separate chaining comparison plot...")
|
||||
|
||||
sizes = [100, 500, 1000, 5000, 10000]
|
||||
results = benchmark_open_addressing_vs_chaining(sizes)
|
||||
|
||||
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
||||
fig.suptitle('Open Addressing vs Separate Chaining Performance', fontsize=16, fontweight='bold')
|
||||
|
||||
sizes_arr = np.array(sizes)
|
||||
|
||||
# Insert time
|
||||
ax = axes[0, 0]
|
||||
for probe_type in ['linear', 'quadratic', 'double']:
|
||||
insert_times = [r['insert_time'] for r in results['open_addressing'][probe_type]]
|
||||
ax.plot(sizes_arr, insert_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
|
||||
|
||||
insert_times_sc = [r['insert_time'] for r in results['separate_chaining']]
|
||||
ax.plot(sizes_arr, insert_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
|
||||
ax.set_xlabel('Number of Elements')
|
||||
ax.set_ylabel('Insert Time (seconds)')
|
||||
ax.set_title('Insert Performance', fontweight='bold')
|
||||
ax.legend()
|
||||
ax.grid(alpha=0.3)
|
||||
ax.set_xscale('log')
|
||||
ax.set_yscale('log')
|
||||
|
||||
# Search time
|
||||
ax = axes[0, 1]
|
||||
for probe_type in ['linear', 'quadratic', 'double']:
|
||||
search_times = [r['search_time'] for r in results['open_addressing'][probe_type]]
|
||||
ax.plot(sizes_arr, search_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
|
||||
|
||||
search_times_sc = [r['search_time'] for r in results['separate_chaining']]
|
||||
ax.plot(sizes_arr, search_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
|
||||
ax.set_xlabel('Number of Elements')
|
||||
ax.set_ylabel('Search Time (seconds)')
|
||||
ax.set_title('Search Performance', fontweight='bold')
|
||||
ax.legend()
|
||||
ax.grid(alpha=0.3)
|
||||
ax.set_xscale('log')
|
||||
ax.set_yscale('log')
|
||||
|
||||
# Delete time
|
||||
ax = axes[1, 0]
|
||||
for probe_type in ['linear', 'quadratic', 'double']:
|
||||
delete_times = [r['delete_time'] for r in results['open_addressing'][probe_type]]
|
||||
ax.plot(sizes_arr, delete_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
|
||||
|
||||
delete_times_sc = [r['delete_time'] for r in results['separate_chaining']]
|
||||
ax.plot(sizes_arr, delete_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
|
||||
ax.set_xlabel('Number of Elements')
|
||||
ax.set_ylabel('Delete Time (seconds)')
|
||||
ax.set_title('Delete Performance', fontweight='bold')
|
||||
ax.legend()
|
||||
ax.grid(alpha=0.3)
|
||||
ax.set_xscale('log')
|
||||
ax.set_yscale('log')
|
||||
|
||||
# Load factors
|
||||
ax = axes[1, 1]
|
||||
for probe_type in ['linear', 'quadratic', 'double']:
|
||||
load_factors = [r['load_factor'] for r in results['open_addressing'][probe_type]]
|
||||
ax.plot(sizes_arr, load_factors, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
|
||||
|
||||
load_factors_sc = [r['load_factor'] for r in results['separate_chaining']]
|
||||
ax.plot(sizes_arr, load_factors_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
|
||||
ax.set_xlabel('Number of Elements')
|
||||
ax.set_ylabel('Load Factor')
|
||||
ax.set_title('Load Factor', fontweight='bold')
|
||||
ax.legend()
|
||||
ax.grid(alpha=0.3)
|
||||
ax.set_xscale('log')
|
||||
|
||||
plt.tight_layout()
|
||||
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'open_addressing_vs_chaining.png')
|
||||
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
||||
print(f"Saved: {output_path}")
|
||||
plt.close()
|
||||
|
||||
|
||||
def plot_load_factor_impact():
|
||||
"""Plot performance at different load factors with statistical smoothing."""
|
||||
print("Generating load factor impact plot...")
|
||||
|
||||
results = benchmark_load_factor_impact(initial_size=100, max_elements=1000, probe_type='linear', num_runs=30)
|
||||
|
||||
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
|
||||
fig.suptitle('Performance Impact of Load Factor', fontsize=16, fontweight='bold')
|
||||
|
||||
# Extract data
|
||||
oa_data = results['open_addressing']
|
||||
sc_data = results['separate_chaining']
|
||||
|
||||
# Sort by load factor to avoid zig-zag lines
|
||||
oa_sorted = sorted(oa_data, key=lambda x: x['load_factor'])
|
||||
sc_sorted = sorted(sc_data, key=lambda x: x['load_factor'])
|
||||
|
||||
oa_load_factors = [r['load_factor'] for r in oa_sorted]
|
||||
oa_insert_times = [r['insert_time'] for r in oa_sorted]
|
||||
oa_insert_stds = [r.get('insert_time_std', 0) for r in oa_sorted]
|
||||
oa_search_times = [r['search_time'] for r in oa_sorted]
|
||||
oa_search_stds = [r.get('search_time_std', 0) for r in oa_sorted]
|
||||
|
||||
sc_load_factors = [r['load_factor'] for r in sc_sorted]
|
||||
sc_insert_times = [r['insert_time'] for r in sc_sorted]
|
||||
sc_insert_stds = [r.get('insert_time_std', 0) for r in sc_sorted]
|
||||
sc_search_times = [r['search_time'] for r in sc_sorted]
|
||||
sc_search_stds = [r.get('search_time_std', 0) for r in sc_sorted]
|
||||
sc_chain_lengths = [r['avg_chain_length'] for r in sc_sorted]
|
||||
|
||||
# Insert time vs load factor (per element) with error bars
|
||||
ax = axes[0]
|
||||
ax.errorbar(oa_load_factors, oa_insert_times, yerr=oa_insert_stds,
|
||||
marker='o', label='Open Addressing (Linear)', linewidth=2,
|
||||
capsize=3, capthick=1.5, alpha=0.8)
|
||||
ax.errorbar(sc_load_factors, sc_insert_times, yerr=sc_insert_stds,
|
||||
marker='s', label='Separate Chaining', linewidth=2, linestyle='--',
|
||||
capsize=3, capthick=1.5, alpha=0.8)
|
||||
ax.set_xlabel('Load Factor')
|
||||
ax.set_ylabel('Insert Time per Element (seconds)')
|
||||
ax.set_title('Insert Time vs Load Factor', fontweight='bold')
|
||||
ax.legend()
|
||||
ax.grid(alpha=0.3)
|
||||
|
||||
# Search time vs load factor (per element) with error bars
|
||||
ax = axes[1]
|
||||
ax.errorbar(oa_load_factors, oa_search_times, yerr=oa_search_stds,
|
||||
marker='o', label='Open Addressing (Linear)', linewidth=2,
|
||||
capsize=3, capthick=1.5, alpha=0.8)
|
||||
ax.errorbar(sc_load_factors, sc_search_times, yerr=sc_search_stds,
|
||||
marker='s', label='Separate Chaining', linewidth=2, linestyle='--',
|
||||
capsize=3, capthick=1.5, alpha=0.8)
|
||||
ax2 = ax.twinx()
|
||||
# Chain length is smooth and accurate, so use line plot
|
||||
ax2.plot(sc_load_factors, sc_chain_lengths, marker='^',
|
||||
label='Avg Chain Length (SC)', color='green', linestyle=':', linewidth=2)
|
||||
ax.set_xlabel('Load Factor')
|
||||
ax.set_ylabel('Search Time per Element (seconds)', color='blue')
|
||||
ax2.set_ylabel('Average Chain Length', color='green')
|
||||
ax.set_title('Search Time vs Load Factor', fontweight='bold')
|
||||
ax.legend(loc='upper left')
|
||||
ax2.legend(loc='upper right')
|
||||
ax.grid(alpha=0.3)
|
||||
|
||||
plt.tight_layout()
|
||||
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'load_factor_impact.png')
|
||||
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
||||
print(f"Saved: {output_path}")
|
||||
plt.close()
|
||||
|
||||
|
||||
def plot_load_factor_impact_probes():
|
||||
"""Plot probe counts and comparisons at different load factors.
|
||||
|
||||
Uses deterministic metrics (probe counts, comparisons) instead of timing
|
||||
to produce smooth theoretical curves without measurement noise.
|
||||
"""
|
||||
print("Generating load factor impact plot (probe counts)...")
|
||||
|
||||
results = benchmark_load_factor_impact_probes(initial_size=100, max_elements=1000, probe_type='linear', num_runs=10)
|
||||
|
||||
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
||||
fig.suptitle('Performance Impact of Load Factor (Probe Counts & Comparisons)', fontsize=16, fontweight='bold')
|
||||
|
||||
# Extract data
|
||||
oa_data = results['open_addressing']
|
||||
sc_data = results['separate_chaining']
|
||||
|
||||
# Sort by load factor
|
||||
oa_sorted = sorted(oa_data, key=lambda x: x['load_factor'])
|
||||
sc_sorted = sorted(sc_data, key=lambda x: x['load_factor'])
|
||||
|
||||
oa_load_factors = [r['load_factor'] for r in oa_sorted]
|
||||
oa_insert_probes = [r['insert_probes_per_element'] for r in oa_sorted]
|
||||
oa_search_probes = [r['search_probes_per_element'] for r in oa_sorted]
|
||||
oa_insert_comparisons = [r['insert_comparisons_per_element'] for r in oa_sorted]
|
||||
oa_search_comparisons = [r['search_comparisons_per_element'] for r in oa_sorted]
|
||||
|
||||
sc_load_factors = [r['load_factor'] for r in sc_sorted]
|
||||
sc_insert_comparisons = [r['insert_comparisons_per_element'] for r in sc_sorted]
|
||||
sc_search_comparisons = [r['search_comparisons_per_element'] for r in sc_sorted]
|
||||
sc_chain_lengths = [r['avg_chain_length'] for r in sc_sorted]
|
||||
|
||||
# Insert probes per element (Open Addressing)
|
||||
ax = axes[0, 0]
|
||||
ax.plot(oa_load_factors, oa_insert_probes, marker='o', label='Open Addressing (Linear)',
|
||||
linewidth=2, color='blue', markersize=6)
|
||||
ax.set_xlabel('Load Factor')
|
||||
ax.set_ylabel('Probes per Element')
|
||||
ax.set_title('Insert: Probes per Element (Open Addressing)', fontweight='bold')
|
||||
ax.legend()
|
||||
ax.grid(alpha=0.3)
|
||||
|
||||
# Search probes per element (Open Addressing)
|
||||
ax = axes[0, 1]
|
||||
ax.plot(oa_load_factors, oa_search_probes, marker='o', label='Open Addressing (Linear)',
|
||||
linewidth=2, color='blue', markersize=6)
|
||||
ax.set_xlabel('Load Factor')
|
||||
ax.set_ylabel('Probes per Element')
|
||||
ax.set_title('Search: Probes per Element (Open Addressing)', fontweight='bold')
|
||||
ax.legend()
|
||||
ax.grid(alpha=0.3)
|
||||
|
||||
# Comparisons per element (both methods)
|
||||
ax = axes[1, 0]
|
||||
ax.plot(oa_load_factors, oa_insert_comparisons, marker='o', label='Open Addressing (Linear)',
|
||||
linewidth=2, color='blue', markersize=6)
|
||||
ax.plot(sc_load_factors, sc_insert_comparisons, marker='s', label='Separate Chaining',
|
||||
linewidth=2, linestyle='--', color='orange', markersize=6)
|
||||
ax.set_xlabel('Load Factor')
|
||||
ax.set_ylabel('Comparisons per Element')
|
||||
ax.set_title('Insert: Comparisons per Element', fontweight='bold')
|
||||
ax.legend()
|
||||
ax.grid(alpha=0.3)
|
||||
|
||||
# Search comparisons per element and chain length
|
||||
ax = axes[1, 1]
|
||||
ax.plot(oa_load_factors, oa_search_comparisons, marker='o', label='Open Addressing (Linear)',
|
||||
linewidth=2, color='blue', markersize=6)
|
||||
ax.plot(sc_load_factors, sc_search_comparisons, marker='s', label='Separate Chaining',
|
||||
linewidth=2, linestyle='--', color='orange', markersize=6)
|
||||
ax2 = ax.twinx()
|
||||
ax2.plot(sc_load_factors, sc_chain_lengths, marker='^', label='Avg Chain Length (SC)',
|
||||
color='green', linestyle=':', linewidth=2, markersize=6)
|
||||
ax.set_xlabel('Load Factor')
|
||||
ax.set_ylabel('Comparisons per Element', color='blue')
|
||||
ax2.set_ylabel('Average Chain Length', color='green')
|
||||
ax.set_title('Search: Comparisons per Element', fontweight='bold')
|
||||
ax.legend(loc='upper left')
|
||||
ax2.legend(loc='upper right')
|
||||
ax.grid(alpha=0.3)
|
||||
|
||||
plt.tight_layout()
|
||||
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'load_factor_impact_probes.png')
|
||||
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
||||
print(f"Saved: {output_path}")
|
||||
plt.close()
|
||||
|
||||
|
||||
def plot_collision_analysis():
|
||||
"""Plot collision analysis for different hash functions."""
|
||||
print("Generating collision analysis plot...")
|
||||
|
||||
keys = generate_test_data(500)
|
||||
table_size = 100
|
||||
|
||||
hash_funcs = {
|
||||
'Division': division_hash,
|
||||
'Multiplication': lambda k, s: multiplication_hash(k, s),
|
||||
'Simple String': lambda k, s: string_hash_simple(str(k), s),
|
||||
'Polynomial': lambda k, s: string_hash_polynomial(str(k), s),
|
||||
'Bad Clustering': bad_hash_clustering,
|
||||
}
|
||||
|
||||
results = benchmark_hash_functions(hash_funcs, keys, table_size)
|
||||
|
||||
fig, ax = plt.subplots(figsize=(12, 6))
|
||||
|
||||
names = list(results.keys())
|
||||
collision_counts = [results[n]['collisions'] for n in names]
|
||||
colors = ['steelblue' if 'Bad' not in n else 'coral' for n in names]
|
||||
|
||||
bars = ax.bar(names, collision_counts, color=colors)
|
||||
ax.set_xlabel('Hash Function', fontweight='bold')
|
||||
ax.set_ylabel('Number of Collisions', fontweight='bold')
|
||||
ax.set_title('Collision Analysis: Good vs Bad Hash Functions', fontsize=14, fontweight='bold')
|
||||
ax.grid(axis='y', alpha=0.3)
|
||||
ax.tick_params(axis='x', rotation=45)
|
||||
|
||||
# Add value labels on bars
|
||||
for bar in bars:
|
||||
height = bar.get_height()
|
||||
ax.text(bar.get_x() + bar.get_width()/2., height,
|
||||
f'{int(height)}',
|
||||
ha='center', va='bottom', fontweight='bold')
|
||||
|
||||
plt.tight_layout()
|
||||
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'collision_analysis.png')
|
||||
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
||||
print(f"Saved: {output_path}")
|
||||
plt.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Ensure docs directory exists
|
||||
os.makedirs(os.path.join(os.path.dirname(__file__), '..', 'docs'), exist_ok=True)
|
||||
|
||||
print("Generating visualization plots...")
|
||||
print("This may take a few minutes...\n")
|
||||
|
||||
plot_hash_function_comparison()
|
||||
plot_open_addressing_vs_chaining()
|
||||
plot_load_factor_impact()
|
||||
plot_load_factor_impact_probes()
|
||||
plot_collision_analysis()
|
||||
|
||||
print("\nAll plots generated successfully!")
|
||||
|
||||
Reference in New Issue
Block a user