Adding hash algorithms

This commit is contained in:
Carlos Gutierrez
2025-11-24 17:11:51 -05:00
commit 7416dfed38
25 changed files with 10315 additions and 0 deletions

389
examples/generate_plots.py Normal file
View File

@@ -0,0 +1,389 @@
"""
Generate visualization plots for hash table performance analysis.
"""
import sys
import os
import matplotlib.pyplot as plt
import numpy as np
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from src.benchmark import (
benchmark_hash_functions,
benchmark_open_addressing_vs_chaining,
benchmark_load_factor_impact,
benchmark_load_factor_impact_probes,
generate_test_data
)
from src.hash_functions import (
division_hash,
multiplication_hash,
string_hash_simple,
string_hash_polynomial,
string_hash_djb2,
bad_hash_clustering
)
def plot_hash_function_comparison():
"""Compare different hash functions."""
print("Generating hash function comparison plot...")
keys = generate_test_data(1000)
table_size = 100
hash_funcs = {
'Division': division_hash,
'Multiplication': lambda k, s: multiplication_hash(k, s),
'Simple String': lambda k, s: string_hash_simple(str(k), s),
'Polynomial String': lambda k, s: string_hash_polynomial(str(k), s),
'DJB2': lambda k, s: string_hash_djb2(str(k), s),
'Bad Clustering': bad_hash_clustering,
}
results = benchmark_hash_functions(hash_funcs, keys, table_size)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Hash Function Performance Comparison', fontsize=16, fontweight='bold')
names = list(results.keys())
collision_rates = [results[n]['collision_rate'] * 100 for n in names]
variances = [results[n]['variance'] for n in names]
times = [results[n]['time'] * 1000 for n in names] # Convert to ms
max_chains = [results[n]['max_chain_length'] for n in names]
# Collision rate
axes[0, 0].bar(names, collision_rates, color='steelblue')
axes[0, 0].set_title('Collision Rate (%)', fontweight='bold')
axes[0, 0].set_ylabel('Collision Rate (%)')
axes[0, 0].tick_params(axis='x', rotation=45)
axes[0, 0].grid(axis='y', alpha=0.3)
# Variance (distribution quality)
axes[0, 1].bar(names, variances, color='coral')
axes[0, 1].set_title('Distribution Variance (Lower is Better)', fontweight='bold')
axes[0, 1].set_ylabel('Variance')
axes[0, 1].tick_params(axis='x', rotation=45)
axes[0, 1].grid(axis='y', alpha=0.3)
# Execution time
axes[1, 0].bar(names, times, color='mediumseagreen')
axes[1, 0].set_title('Execution Time', fontweight='bold')
axes[1, 0].set_ylabel('Time (ms)')
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 0].grid(axis='y', alpha=0.3)
# Max chain length
axes[1, 1].bar(names, max_chains, color='plum')
axes[1, 1].set_title('Maximum Chain Length', fontweight='bold')
axes[1, 1].set_ylabel('Max Chain Length')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].grid(axis='y', alpha=0.3)
plt.tight_layout()
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'hash_function_comparison.png')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Saved: {output_path}")
plt.close()
def plot_open_addressing_vs_chaining():
"""Compare open addressing vs separate chaining."""
print("Generating open addressing vs separate chaining comparison plot...")
sizes = [100, 500, 1000, 5000, 10000]
results = benchmark_open_addressing_vs_chaining(sizes)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Open Addressing vs Separate Chaining Performance', fontsize=16, fontweight='bold')
sizes_arr = np.array(sizes)
# Insert time
ax = axes[0, 0]
for probe_type in ['linear', 'quadratic', 'double']:
insert_times = [r['insert_time'] for r in results['open_addressing'][probe_type]]
ax.plot(sizes_arr, insert_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
insert_times_sc = [r['insert_time'] for r in results['separate_chaining']]
ax.plot(sizes_arr, insert_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
ax.set_xlabel('Number of Elements')
ax.set_ylabel('Insert Time (seconds)')
ax.set_title('Insert Performance', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
ax.set_xscale('log')
ax.set_yscale('log')
# Search time
ax = axes[0, 1]
for probe_type in ['linear', 'quadratic', 'double']:
search_times = [r['search_time'] for r in results['open_addressing'][probe_type]]
ax.plot(sizes_arr, search_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
search_times_sc = [r['search_time'] for r in results['separate_chaining']]
ax.plot(sizes_arr, search_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
ax.set_xlabel('Number of Elements')
ax.set_ylabel('Search Time (seconds)')
ax.set_title('Search Performance', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
ax.set_xscale('log')
ax.set_yscale('log')
# Delete time
ax = axes[1, 0]
for probe_type in ['linear', 'quadratic', 'double']:
delete_times = [r['delete_time'] for r in results['open_addressing'][probe_type]]
ax.plot(sizes_arr, delete_times, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
delete_times_sc = [r['delete_time'] for r in results['separate_chaining']]
ax.plot(sizes_arr, delete_times_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
ax.set_xlabel('Number of Elements')
ax.set_ylabel('Delete Time (seconds)')
ax.set_title('Delete Performance', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
ax.set_xscale('log')
ax.set_yscale('log')
# Load factors
ax = axes[1, 1]
for probe_type in ['linear', 'quadratic', 'double']:
load_factors = [r['load_factor'] for r in results['open_addressing'][probe_type]]
ax.plot(sizes_arr, load_factors, marker='o', label=f'Open Addressing ({probe_type})', linewidth=2)
load_factors_sc = [r['load_factor'] for r in results['separate_chaining']]
ax.plot(sizes_arr, load_factors_sc, marker='s', label='Separate Chaining', linewidth=2, linestyle='--')
ax.set_xlabel('Number of Elements')
ax.set_ylabel('Load Factor')
ax.set_title('Load Factor', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
ax.set_xscale('log')
plt.tight_layout()
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'open_addressing_vs_chaining.png')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Saved: {output_path}")
plt.close()
def plot_load_factor_impact():
"""Plot performance at different load factors with statistical smoothing."""
print("Generating load factor impact plot...")
results = benchmark_load_factor_impact(initial_size=100, max_elements=1000, probe_type='linear', num_runs=30)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Performance Impact of Load Factor', fontsize=16, fontweight='bold')
# Extract data
oa_data = results['open_addressing']
sc_data = results['separate_chaining']
# Sort by load factor to avoid zig-zag lines
oa_sorted = sorted(oa_data, key=lambda x: x['load_factor'])
sc_sorted = sorted(sc_data, key=lambda x: x['load_factor'])
oa_load_factors = [r['load_factor'] for r in oa_sorted]
oa_insert_times = [r['insert_time'] for r in oa_sorted]
oa_insert_stds = [r.get('insert_time_std', 0) for r in oa_sorted]
oa_search_times = [r['search_time'] for r in oa_sorted]
oa_search_stds = [r.get('search_time_std', 0) for r in oa_sorted]
sc_load_factors = [r['load_factor'] for r in sc_sorted]
sc_insert_times = [r['insert_time'] for r in sc_sorted]
sc_insert_stds = [r.get('insert_time_std', 0) for r in sc_sorted]
sc_search_times = [r['search_time'] for r in sc_sorted]
sc_search_stds = [r.get('search_time_std', 0) for r in sc_sorted]
sc_chain_lengths = [r['avg_chain_length'] for r in sc_sorted]
# Insert time vs load factor (per element) with error bars
ax = axes[0]
ax.errorbar(oa_load_factors, oa_insert_times, yerr=oa_insert_stds,
marker='o', label='Open Addressing (Linear)', linewidth=2,
capsize=3, capthick=1.5, alpha=0.8)
ax.errorbar(sc_load_factors, sc_insert_times, yerr=sc_insert_stds,
marker='s', label='Separate Chaining', linewidth=2, linestyle='--',
capsize=3, capthick=1.5, alpha=0.8)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Insert Time per Element (seconds)')
ax.set_title('Insert Time vs Load Factor', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
# Search time vs load factor (per element) with error bars
ax = axes[1]
ax.errorbar(oa_load_factors, oa_search_times, yerr=oa_search_stds,
marker='o', label='Open Addressing (Linear)', linewidth=2,
capsize=3, capthick=1.5, alpha=0.8)
ax.errorbar(sc_load_factors, sc_search_times, yerr=sc_search_stds,
marker='s', label='Separate Chaining', linewidth=2, linestyle='--',
capsize=3, capthick=1.5, alpha=0.8)
ax2 = ax.twinx()
# Chain length is smooth and accurate, so use line plot
ax2.plot(sc_load_factors, sc_chain_lengths, marker='^',
label='Avg Chain Length (SC)', color='green', linestyle=':', linewidth=2)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Search Time per Element (seconds)', color='blue')
ax2.set_ylabel('Average Chain Length', color='green')
ax.set_title('Search Time vs Load Factor', fontweight='bold')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.grid(alpha=0.3)
plt.tight_layout()
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'load_factor_impact.png')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Saved: {output_path}")
plt.close()
def plot_load_factor_impact_probes():
"""Plot probe counts and comparisons at different load factors.
Uses deterministic metrics (probe counts, comparisons) instead of timing
to produce smooth theoretical curves without measurement noise.
"""
print("Generating load factor impact plot (probe counts)...")
results = benchmark_load_factor_impact_probes(initial_size=100, max_elements=1000, probe_type='linear', num_runs=10)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Performance Impact of Load Factor (Probe Counts & Comparisons)', fontsize=16, fontweight='bold')
# Extract data
oa_data = results['open_addressing']
sc_data = results['separate_chaining']
# Sort by load factor
oa_sorted = sorted(oa_data, key=lambda x: x['load_factor'])
sc_sorted = sorted(sc_data, key=lambda x: x['load_factor'])
oa_load_factors = [r['load_factor'] for r in oa_sorted]
oa_insert_probes = [r['insert_probes_per_element'] for r in oa_sorted]
oa_search_probes = [r['search_probes_per_element'] for r in oa_sorted]
oa_insert_comparisons = [r['insert_comparisons_per_element'] for r in oa_sorted]
oa_search_comparisons = [r['search_comparisons_per_element'] for r in oa_sorted]
sc_load_factors = [r['load_factor'] for r in sc_sorted]
sc_insert_comparisons = [r['insert_comparisons_per_element'] for r in sc_sorted]
sc_search_comparisons = [r['search_comparisons_per_element'] for r in sc_sorted]
sc_chain_lengths = [r['avg_chain_length'] for r in sc_sorted]
# Insert probes per element (Open Addressing)
ax = axes[0, 0]
ax.plot(oa_load_factors, oa_insert_probes, marker='o', label='Open Addressing (Linear)',
linewidth=2, color='blue', markersize=6)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Probes per Element')
ax.set_title('Insert: Probes per Element (Open Addressing)', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
# Search probes per element (Open Addressing)
ax = axes[0, 1]
ax.plot(oa_load_factors, oa_search_probes, marker='o', label='Open Addressing (Linear)',
linewidth=2, color='blue', markersize=6)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Probes per Element')
ax.set_title('Search: Probes per Element (Open Addressing)', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
# Comparisons per element (both methods)
ax = axes[1, 0]
ax.plot(oa_load_factors, oa_insert_comparisons, marker='o', label='Open Addressing (Linear)',
linewidth=2, color='blue', markersize=6)
ax.plot(sc_load_factors, sc_insert_comparisons, marker='s', label='Separate Chaining',
linewidth=2, linestyle='--', color='orange', markersize=6)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Comparisons per Element')
ax.set_title('Insert: Comparisons per Element', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
# Search comparisons per element and chain length
ax = axes[1, 1]
ax.plot(oa_load_factors, oa_search_comparisons, marker='o', label='Open Addressing (Linear)',
linewidth=2, color='blue', markersize=6)
ax.plot(sc_load_factors, sc_search_comparisons, marker='s', label='Separate Chaining',
linewidth=2, linestyle='--', color='orange', markersize=6)
ax2 = ax.twinx()
ax2.plot(sc_load_factors, sc_chain_lengths, marker='^', label='Avg Chain Length (SC)',
color='green', linestyle=':', linewidth=2, markersize=6)
ax.set_xlabel('Load Factor')
ax.set_ylabel('Comparisons per Element', color='blue')
ax2.set_ylabel('Average Chain Length', color='green')
ax.set_title('Search: Comparisons per Element', fontweight='bold')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.grid(alpha=0.3)
plt.tight_layout()
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'load_factor_impact_probes.png')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Saved: {output_path}")
plt.close()
def plot_collision_analysis():
"""Plot collision analysis for different hash functions."""
print("Generating collision analysis plot...")
keys = generate_test_data(500)
table_size = 100
hash_funcs = {
'Division': division_hash,
'Multiplication': lambda k, s: multiplication_hash(k, s),
'Simple String': lambda k, s: string_hash_simple(str(k), s),
'Polynomial': lambda k, s: string_hash_polynomial(str(k), s),
'Bad Clustering': bad_hash_clustering,
}
results = benchmark_hash_functions(hash_funcs, keys, table_size)
fig, ax = plt.subplots(figsize=(12, 6))
names = list(results.keys())
collision_counts = [results[n]['collisions'] for n in names]
colors = ['steelblue' if 'Bad' not in n else 'coral' for n in names]
bars = ax.bar(names, collision_counts, color=colors)
ax.set_xlabel('Hash Function', fontweight='bold')
ax.set_ylabel('Number of Collisions', fontweight='bold')
ax.set_title('Collision Analysis: Good vs Bad Hash Functions', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
ax.tick_params(axis='x', rotation=45)
# Add value labels on bars
for bar in bars:
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width()/2., height,
f'{int(height)}',
ha='center', va='bottom', fontweight='bold')
plt.tight_layout()
output_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'collision_analysis.png')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Saved: {output_path}")
plt.close()
if __name__ == "__main__":
# Ensure docs directory exists
os.makedirs(os.path.join(os.path.dirname(__file__), '..', 'docs'), exist_ok=True)
print("Generating visualization plots...")
print("This may take a few minutes...\n")
plot_hash_function_comparison()
plot_open_addressing_vs_chaining()
plot_load_factor_impact()
plot_load_factor_impact_probes()
plot_collision_analysis()
print("\nAll plots generated successfully!")

View File

@@ -0,0 +1,207 @@
"""
Demonstration of hash table implementations and their usage.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from src.hash_tables import (
DirectAddressTable,
HashTableOpenAddressing,
HashTableSeparateChaining
)
from src.hash_functions import (
division_hash,
string_hash_polynomial,
string_hash_simple
)
def demo_direct_address_table():
"""Demonstrate direct-address table."""
print("=" * 60)
print("Direct-Address Table Demonstration")
print("=" * 60)
# Create table for keys in range [0, 99]
table = DirectAddressTable(100)
# Insert some values
table.insert(5, "Alice")
table.insert(42, "Bob")
table.insert(99, "Charlie")
print("\nInserted key-value pairs:")
print(" Key 5 ->", table.search(5))
print(" Key 42 ->", table.search(42))
print(" Key 99 ->", table.search(99))
# Search
print("\nSearching for key 42:", table.search(42))
print("Searching for key 10:", table.search(10)) # Not found
# Delete
table.delete(42)
print("\nAfter deleting key 42:")
print(" Key 42 ->", table.search(42)) # None
print()
def demo_open_addressing():
"""Demonstrate open addressing hash table."""
print("=" * 60)
print("Open Addressing Hash Table Demonstration")
print("=" * 60)
# Test with linear probing
print("\n--- Linear Probing ---")
ht_linear = HashTableOpenAddressing(10, probe_type='linear')
keys = [10, 22, 31, 4, 15, 28, 17, 88, 59]
for key in keys:
ht_linear.insert(key, f"Value_{key}")
print(f"Inserted {len(keys)} keys")
print(f"Load factor: {ht_linear._load_factor():.2f}")
print("\nSearching for keys:")
for key in [10, 22, 88, 99]:
result = ht_linear.search(key)
print(f" Key {key}: {'Found' if result else 'Not found'}")
# Test with quadratic probing
print("\n--- Quadratic Probing ---")
# Use larger table size for quadratic probing to avoid probe sequence issues
ht_quad = HashTableOpenAddressing(20, probe_type='quadratic')
for key in keys:
ht_quad.insert(key, f"Value_{key}")
print(f"Inserted {len(keys)} keys")
print(f"Load factor: {ht_quad._load_factor():.2f}")
# Test with double hashing
print("\n--- Double Hashing ---")
# Use larger table size for double hashing to ensure all keys can be inserted
ht_double = HashTableOpenAddressing(20, probe_type='double')
for key in keys:
ht_double.insert(key, f"Value_{key}")
print(f"Inserted {len(keys)} keys")
print(f"Load factor: {ht_double._load_factor():.2f}")
print()
def demo_separate_chaining():
"""Demonstrate separate chaining hash table."""
print("=" * 60)
print("Separate Chaining Hash Table Demonstration")
print("=" * 60)
ht = HashTableSeparateChaining(10)
keys = [10, 22, 31, 4, 15, 28, 17, 88, 59, 71]
for key in keys:
ht.insert(key, f"Value_{key}")
print(f"\nInserted {len(keys)} keys")
print(f"Load factor: {ht._load_factor():.2f}")
chain_lengths = ht.get_chain_lengths()
print(f"Chain lengths: {chain_lengths}")
print(f"Average chain length: {sum(chain_lengths) / len(chain_lengths):.2f}")
print(f"Maximum chain length: {max(chain_lengths)}")
print("\nSearching for keys:")
for key in [10, 22, 88, 99]:
result = ht.search(key)
print(f" Key {key}: {'Found' if result else 'Not found'}")
# Delete some keys
print("\nDeleting keys 22 and 88:")
ht.delete(22)
ht.delete(88)
print(f" Key 22: {'Found' if ht.search(22) else 'Not found'}")
print(f" Key 88: {'Found' if ht.search(88) else 'Not found'}")
print()
def demo_hash_functions():
"""Demonstrate different hash functions."""
print("=" * 60)
print("Hash Function Demonstration")
print("=" * 60)
keys = [10, 22, 31, 4, 15, 28, 17, 88, 59, 71]
table_size = 11
print(f"\nKeys: {keys}")
print(f"Table size: {table_size}\n")
# Division method
print("Division method (h(k) = k mod m):")
for key in keys[:5]:
hash_val = division_hash(key, table_size)
print(f" h({key}) = {hash_val}")
# String hashing
print("\nString hash functions:")
string_keys = ["hello", "world", "hash", "table", "test"]
print("Simple string hash (BAD - prone to collisions):")
for key in string_keys:
hash_val = string_hash_simple(key, table_size)
print(f" h('{key}') = {hash_val}")
print("\nPolynomial string hash (GOOD - better distribution):")
for key in string_keys:
hash_val = string_hash_polynomial(key, table_size)
print(f" h('{key}') = {hash_val}")
print()
def demo_collision_comparison():
"""Demonstrate collision behavior with different hash functions."""
print("=" * 60)
print("Collision Comparison Demonstration")
print("=" * 60)
# Generate test keys
keys = list(range(100, 200))
table_size = 50
from src.hash_functions import (
division_hash,
multiplication_hash,
string_hash_simple,
string_hash_polynomial
)
hash_funcs = {
'Division': division_hash,
'Multiplication': lambda k, s: multiplication_hash(k, s),
}
print(f"\nTesting with {len(keys)} keys and table size {table_size}\n")
for name, hash_func in hash_funcs.items():
hash_values = [hash_func(k, table_size) for k in keys]
collisions = len(keys) - len(set(hash_values))
collision_rate = collisions / len(keys) * 100
print(f"{name} method:")
print(f" Collisions: {collisions}")
print(f" Collision rate: {collision_rate:.2f}%")
print(f" Buckets used: {len(set(hash_values))}/{table_size}")
print()
if __name__ == "__main__":
demo_direct_address_table()
demo_open_addressing()
demo_separate_chaining()
demo_hash_functions()
demo_collision_comparison()