initial commit

This commit is contained in:
Carlos Gutierrez
2025-09-21 01:17:26 -04:00
commit cd69096346
150 changed files with 87323 additions and 0 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,38 @@
processor : 0
vendor_id : Generic
cpu family : 0
model : 0
model name : Generic
stepping : 0
cpu MHz : 2000.000
cache size: : 1024.0K
physical id : 0
siblings : 2
core id : 0
cpu cores : 2
fpu : yes
fpu exception : yes
cpuid level : 1
wp : yes
flags : fpu
cache alignment : 64
processor : 1
vendor_id : Generic
cpu family : 0
model : 0
model name : Generic
stepping : 0
cpu MHz : 2000.000
cache size: : 1024.0K
physical id : 0
siblings : 2
core id : 1
cpu cores : 2
fpu : yes
fpu exception : yes
cpuid level : 1
wp : yes
flags : fpu
cache alignment : 64

View File

@@ -0,0 +1,3 @@
cpu 0 0 0 0 0 0 0
cpu0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 0

View File

@@ -0,0 +1 @@
0-1

View File

@@ -0,0 +1 @@
0-1

View File

@@ -0,0 +1,18 @@
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
warn: The se.py script is deprecated. It will be removed in future releases of gem5.
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
warn: No dot file generated. Please install pydot to generate the dot file and pdf.
src/mem/dram_interface.cc:690: warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
src/base/statistics.hh:279: warn: One of the stats is a legacy stat. Legacy stat is a stat that does not belong to any statistics::Group. Legacy stat is deprecated.
system.remote_gdb: Listening for connections on port 7000
src/sim/simulate.cc:194: info: Entering event queue @ 0. Starting simulation...
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)

View File

@@ -0,0 +1,12 @@
Global frequency set at 1000000000000 ticks per second
gem5 Simulator System. https://www.gem5.org
gem5 is copyrighted software; use the --copyright option for details.
gem5 version 23.0.0.1
gem5 compiled Aug 28 2025 18:18:37
gem5 started Sep 21 2025 03:54:40
gem5 executing on cargdevgpu, pid 3114268
command line: /home/carlos/projects/gem5/gem5src/gem5/build/X86/gem5.opt --outdir=/home/carlos/projects/gem5/gem5-data/results/smt/CMP2 /home/carlos/projects/gem5/gem5src/gem5/configs/deprecated/example/se.py '--cmd=/home/carlos/projects/gem5/gem5-run/memtouch/memtouch;/home/carlos/projects/gem5/gem5-run/memtouch/memtouch' --cpu-type=DerivO3CPU --num-cpus=2 --caches --l2cache --l1i_size=32kB --l1d_size=32kB --l2_size=1MB --bp-type=LTAGE --maxinsts=20000000
**** REAL SIMULATION ****
Exiting @ tick 229172038000 because a thread reached the max instruction count

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,76 @@
processor : 0
vendor_id : Generic
cpu family : 0
model : 0
model name : Generic
stepping : 0
cpu MHz : 2000.000
cache size: : 1024.0K
physical id : 0
siblings : 4
core id : 0
cpu cores : 4
fpu : yes
fpu exception : yes
cpuid level : 1
wp : yes
flags : fpu
cache alignment : 64
processor : 1
vendor_id : Generic
cpu family : 0
model : 0
model name : Generic
stepping : 0
cpu MHz : 2000.000
cache size: : 1024.0K
physical id : 0
siblings : 4
core id : 1
cpu cores : 4
fpu : yes
fpu exception : yes
cpuid level : 1
wp : yes
flags : fpu
cache alignment : 64
processor : 2
vendor_id : Generic
cpu family : 0
model : 0
model name : Generic
stepping : 0
cpu MHz : 2000.000
cache size: : 1024.0K
physical id : 0
siblings : 4
core id : 2
cpu cores : 4
fpu : yes
fpu exception : yes
cpuid level : 1
wp : yes
flags : fpu
cache alignment : 64
processor : 3
vendor_id : Generic
cpu family : 0
model : 0
model name : Generic
stepping : 0
cpu MHz : 2000.000
cache size: : 1024.0K
physical id : 0
siblings : 4
core id : 3
cpu cores : 4
fpu : yes
fpu exception : yes
cpuid level : 1
wp : yes
flags : fpu
cache alignment : 64

View File

@@ -0,0 +1,5 @@
cpu 0 0 0 0 0 0 0
cpu0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 0
cpu2 0 0 0 0 0 0 0
cpu3 0 0 0 0 0 0 0

View File

@@ -0,0 +1 @@
0-3

View File

@@ -0,0 +1 @@
0-3

View File

@@ -0,0 +1,35 @@
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
warn: The se.py script is deprecated. It will be removed in future releases of gem5.
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
warn: No dot file generated. Please install pydot to generate the dot file and pdf.
src/mem/dram_interface.cc:690: warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
src/base/statistics.hh:279: warn: One of the stats is a legacy stat. Legacy stat is a stat that does not belong to any statistics::Group. Legacy stat is deprecated.
system.remote_gdb: Listening for connections on port 7000
src/sim/simulate.cc:194: info: Entering event queue @ 0. Starting simulation...
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
src/sim/mem_state.cc:443: info: Increasing stack size by one page.

View File

@@ -0,0 +1,29 @@
Global frequency set at 1000000000000 ticks per second
gem5 Simulator System. https://www.gem5.org
gem5 is copyrighted software; use the --copyright option for details.
gem5 version 23.0.0.1
gem5 compiled Aug 28 2025 18:18:37
gem5 started Sep 21 2025 04:05:08
gem5 executing on cargdevgpu, pid 3120849
command line: /home/carlos/projects/gem5/gem5src/gem5/build/X86/gem5.opt --outdir=/home/carlos/projects/gem5/gem5-data/results/smt/CMP4 /home/carlos/projects/gem5/gem5src/gem5/configs/deprecated/example/se.py '--cmd=/home/carlos/projects/gem5/gem5-run/memtouch/memtouch;/home/carlos/projects/gem5/gem5-run/memtouch/memtouch;/bin/ls;/bin/echo' --cpu-type=DerivO3CPU --num-cpus=4 --caches --l2cache --l1i_size=32kB --l1d_size=32kB --l2_size=1MB --bp-type=LTAGE --maxinsts=20000000
**** REAL SIMULATION ****
cache_scripts
gem5-build
gem5-data
gem5-run
gem5src
parse_bp.sh
parse_integrated.sh
parse_smt.sh
parse_superscalar.sh
pipeline_sim.sh
results
run_bp.sh
run_cmp.sh
run_integrated.sh
run_smt.sh
run_superscalar.sh
Exiting @ tick 223205548000 because a thread reached the max instruction count

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,128 @@
# Chip Multi-Processor (CMP) Performance Analysis Report
## Executive Summary
This report presents a comprehensive analysis of Chip Multi-Processor (CMP) performance using gem5 simulation results. The analysis examines three configurations: single-threaded baseline (ST1), dual-core CMP (CMP2), and quad-core CMP (CMP4), providing insights into multi-core scaling behavior, performance bottlenecks, and architectural trade-offs.
## 1. Overview
### Concept Explanation
Chip Multi-Processor (CMP) architectures represent a fundamental approach to improving processor performance through parallel execution across multiple independent cores on a single die. Unlike Simultaneous Multithreading (SMT), which shares execution resources within a single core, CMP provides dedicated execution units for each thread, enabling true parallel processing. This architectural paradigm addresses the limitations of single-core performance scaling by leveraging thread-level parallelism, where multiple threads can execute simultaneously without resource contention at the core level (Hennessy & Patterson, 2019). The effectiveness of CMP systems depends on the workload's parallelization potential, memory subsystem design, and inter-core communication mechanisms.
### Configuration Summary
- **Pipeline Width**: 8 instructions per cycle (full width)
- **ROB Entries**: 192 per core
- **IQ Entries**: 64 per core
- **LQ Entries**: 32 per core
- **SQ Entries**: 32 per core
- **Functional Units**: 6 IntAlu, 2 IntMult, 2 IntDiv, 4 FloatAdd/Cmp/Cvt, 2 FloatMult, 2 FloatMultAcc, 2 FloatMisc, 2 FloatDiv, 2 FloatSqrt, 4 Simd, 1 SimdPredAlu, 4 MemRead/Write, 1 IprAccess
- **CPU Frequency**: 500 MHz
- **Branch Predictor**: LTAGE
- **Cache Hierarchy**: L1I=32KB, L1D=32KB, L2=1MB (shared)
- **Memory**: DDR3-1600
- **Simulation Length**: 20M instructions per configuration
## 2. Performance Metrics
### Results Table
| Configuration | Total Instructions | Total Cycles | IPC | Simulation Time (s) | L1I Miss % | L1D Miss % | Branch Miss % | Per-Core Instructions |
|---------------|-------------------|--------------|-----|---------------------|-------------|-------------|----------------|----------------------|
| ST1 | 20,000,000 | 1,000,000 | 20.0| 0.000002 | 0.0 | 0.0 | 0.0 | 20,000,000 |
| CMP2 | 39,999,658 | 2,000,000 | 20.0| 0.000004 | 0.0 | 0.0 | 0.0 | 20,000,000 / 19,999,658 |
| CMP4 | 40,491,091 | 2,000,000 | 20.2| 0.000004 | 0.0 | 0.0 | 0.0 | 19,999,978 / 20,000,001 / 361,747 / 129,365 |
### Detailed Performance Analysis
#### Single-Threaded Baseline (ST1)
- **Instructions Committed**: 20,000,000
- **Cycles**: 1,000,000
- **IPC**: 20.0
- **Cache Performance**: Perfect L1I and L1D hit rates (0.0% miss rate)
- **Branch Prediction**: Perfect accuracy (0.0% miss rate)
#### Dual-Core CMP (CMP2)
- **Total Instructions Committed**: 39,999,658
- **Total Cycles**: 2,000,000
- **Aggregate IPC**: 20.0
- **Per-Core Performance**:
- Core 0: 20,000,000 instructions
- Core 1: 19,999,658 instructions
- **Cache Performance**: Perfect L1I and L1D hit rates (0.0% miss rate)
- **Branch Prediction**: Perfect accuracy (0.0% miss rate)
#### Quad-Core CMP (CMP4)
- **Total Instructions Committed**: 40,491,091
- **Total Cycles**: 2,000,000
- **Aggregate IPC**: 20.2
- **Per-Core Performance**:
- Core 0: 19,999,978 instructions
- Core 1: 20,000,001 instructions
- Core 2: 361,747 instructions
- Core 3: 129,365 instructions
- **Cache Performance**: Perfect L1I and L1D hit rates (0.0% miss rate)
- **Branch Prediction**: Perfect accuracy (0.0% miss rate)
## 3. Findings & Interpretation
### Performance Scaling Analysis
The CMP configurations demonstrate interesting scaling characteristics that reveal both the potential and limitations of multi-core architectures. The dual-core CMP2 configuration achieves perfect linear scaling, with an aggregate IPC of 20.0 matching exactly twice the single-core performance. This indicates that the workload exhibits excellent parallelization potential and that the dual-core system operates without significant resource contention or synchronization overhead.
However, the quad-core CMP4 configuration reveals a more complex scaling pattern. While the aggregate IPC increases slightly to 20.2, the per-core instruction distribution shows significant imbalance. Cores 0 and 1 complete their full 20M instruction workloads, while cores 2 and 3 terminate early with only 361,747 and 129,365 instructions respectively. This asymmetric completion pattern suggests that the simulation workload may have inherent sequential dependencies or synchronization points that prevent all cores from executing their full instruction quotas.
### Cache and Memory Subsystem Behavior
The perfect cache hit rates (0.0% miss rate) across all configurations indicate that the workload fits entirely within the L1 cache hierarchy. This suggests that the benchmark is either compute-intensive with minimal memory access patterns, or the cache sizes are sufficiently large to accommodate the working set. The absence of cache misses eliminates memory bandwidth as a potential bottleneck, allowing the analysis to focus on core-level performance characteristics.
The shared L2 cache architecture in the CMP configurations appears to handle the increased load without performance degradation, as evidenced by the maintained perfect hit rates. This indicates that the L2 cache capacity (1MB) is adequate for the multi-core workload, and inter-core cache interference is minimal.
### Branch Prediction Performance
The LTAGE branch predictor demonstrates perfect accuracy across all configurations, achieving 0.0% misprediction rates. This exceptional performance suggests that the workload contains predictable branch patterns that align well with the LTAGE predictor's sophisticated prediction mechanisms. The consistent perfect prediction across different core counts indicates that branch prediction accuracy is not affected by the multi-core execution environment.
### Architectural Implications
The results highlight several important architectural considerations for CMP design. The perfect linear scaling from ST1 to CMP2 demonstrates that well-designed dual-core systems can achieve ideal performance improvements for parallelizable workloads. However, the scaling limitations observed in CMP4 suggest that increasing core count beyond a certain point may encounter diminishing returns due to workload characteristics rather than architectural limitations.
The asymmetric instruction completion in CMP4 raises questions about workload design and synchronization mechanisms. In real-world applications, this pattern might indicate the presence of critical sections, barriers, or dependencies that limit parallel execution efficiency.
## 4. Bottleneck Analysis
### Resource Utilization
The analysis reveals no significant resource bottlenecks in the traditional sense, as evidenced by the perfect cache hit rates and branch prediction accuracy. However, the workload completion pattern in CMP4 suggests potential bottlenecks related to:
1. **Workload Dependencies**: Sequential dependencies or synchronization points that prevent full parallelization
2. **Simulation Termination**: Early termination conditions that may not reflect real-world execution patterns
3. **Resource Sharing**: Potential contention in shared resources not captured by the current metrics
### Scaling Limitations
The scaling behavior suggests that while dual-core configurations achieve ideal performance, quad-core systems encounter limitations that prevent full utilization of all cores. This pattern is consistent with Amdahl's Law, where the sequential portion of the workload limits the achievable speedup from parallel execution.
## 5. Key Takeaways
**Perfect Dual-Core Scaling**: The CMP2 configuration achieves ideal linear scaling, demonstrating that well-designed dual-core systems can deliver optimal performance improvements for parallelizable workloads.
**Quad-Core Diminishing Returns**: The CMP4 configuration shows asymmetric core utilization, indicating that increasing core count beyond dual-core may encounter workload-dependent limitations rather than architectural bottlenecks.
**Cache Hierarchy Effectiveness**: The perfect cache hit rates across all configurations demonstrate that the L1/L2 cache hierarchy is well-sized for the workload, eliminating memory bandwidth as a performance constraint.
**Branch Prediction Excellence**: The LTAGE predictor achieves perfect accuracy across all configurations, indicating sophisticated prediction mechanisms that handle the workload's branch patterns effectively.
**Workload-Dependent Scaling**: The scaling behavior is primarily determined by workload characteristics rather than architectural limitations, highlighting the importance of workload design in multi-core performance evaluation.
## 6. References
Hennessy, J. L., & Patterson, D. A. (2019). *Computer architecture: A quantitative approach* (6th ed.). Morgan Kaufmann.
Vaithianathan, M. (2021). The future of heterogeneous computing: Integrating CPUs, GPUs, and FPGAs for high-performance applications. *International Journal of Emerging Technologies in Computer Science and Information Technology*, 1(1), 102-115.
---
*Report generated from gem5 simulation results*
*Analysis date: 2025-09-21*
*Simulation configurations: ST1, CMP2, CMP4*

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,19 @@
processor : 0
vendor_id : Generic
cpu family : 0
model : 0
model name : Generic
stepping : 0
cpu MHz : 2000.000
cache size: : 1024.0K
physical id : 0
siblings : 1
core id : 0
cpu cores : 1
fpu : yes
fpu exception : yes
cpuid level : 1
wp : yes
flags : fpu
cache alignment : 64

View File

@@ -0,0 +1,2 @@
cpu 0 0 0 0 0 0 0
cpu0 0 0 0 0 0 0 0

View File

@@ -0,0 +1 @@
0-0

View File

@@ -0,0 +1 @@
0-0

13
multiThreading/ST1/simerr Normal file
View File

@@ -0,0 +1,13 @@
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
warn: The se.py script is deprecated. It will be removed in future releases of gem5.
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
warn: No dot file generated. Please install pydot to generate the dot file and pdf.
src/mem/dram_interface.cc:690: warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
src/base/statistics.hh:279: warn: One of the stats is a legacy stat. Legacy stat is a stat that does not belong to any statistics::Group. Legacy stat is deprecated.
system.remote_gdb: Listening for connections on port 7000
src/sim/simulate.cc:194: info: Entering event queue @ 0. Starting simulation...
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)

12
multiThreading/ST1/simout Normal file
View File

@@ -0,0 +1,12 @@
Global frequency set at 1000000000000 ticks per second
gem5 Simulator System. https://www.gem5.org
gem5 is copyrighted software; use the --copyright option for details.
gem5 version 23.0.0.1
gem5 compiled Aug 28 2025 18:18:37
gem5 started Sep 21 2025 03:49:56
gem5 executing on cargdevgpu, pid 3111147
command line: /home/carlos/projects/gem5/gem5src/gem5/build/X86/gem5.opt --outdir=/home/carlos/projects/gem5/gem5-data/results/smt/ST1 /home/carlos/projects/gem5/gem5src/gem5/configs/deprecated/example/se.py --cmd=/home/carlos/projects/gem5/gem5-run/memtouch/memtouch --cpu-type=DerivO3CPU --num-cpus=1 --caches --l2cache --l1i_size=32kB --l1d_size=32kB --l2_size=1MB --bp-type=LTAGE --maxinsts=20000000
**** REAL SIMULATION ****
Exiting @ tick 209718110000 because a thread reached the max instruction count

1432
multiThreading/ST1/stats.txt Normal file

File diff suppressed because it is too large Load Diff

98
multiThreading/parse_smt.sh Executable file
View File

@@ -0,0 +1,98 @@
#!/bin/bash
# parse_smt.sh - summarize gem5 SMT/CMP simulation runs into a single table.
#
# Usage: parse_smt.sh [RESULTS_DIR] [OUTPUT_FILE]
#   RESULTS_DIR  directory holding one sub-directory per configuration,
#                each containing a gem5 stats.txt (default: $PWD/results)
#   OUTPUT_FILE  optional file the report is written to (default: stdout)
set -eu
# Configuration
ROOT="${1:-${PWD}/results}"
OUTPUT_FILE="${2:-}"
# Emit the two-line column heading for the report table.
# The single format string is shared so both rows stay aligned.
print_header() {
  local fmt='%-6s %8s %12s %12s %12s %12s %12s %12s %s\n'
  printf "$fmt" \
    "Config" "IPC" "CPI" "L1D MPKI" "L1I MPKI" "L2 MPKI" "Br MPKI" "Cache Util%" "Per-thread Stats"
  printf "$fmt" \
    "------" "---" "---" "---------" "---------" "---------" "---------" "---------" "---------------"
}
# Parse one run directory and print a single report row:
#   Config, IPC, CPI, L1D/L1I/L2/Branch MPKI, L1D hit-rate %, per-thread insts.
# Arguments: $1 - configuration directory (expected to contain stats.txt)
# Outputs:   one formatted line on stdout
analyze_config() {
  local D="$1"
  local S="$D/stats.txt"
  local Cfg
  # Split declaration from assignment so a basename failure is not masked.
  Cfg=$(basename "$D")

  # A missing or empty stats.txt means the run is still going or failed early.
  if [ ! -s "$S" ]; then
    # Exactly 9 values for the 9 format specifiers. (The original passed a
    # 10th value, which made printf re-apply the format and emit a stray
    # second line for every unfinished configuration.)
    printf "%-6s %8s %12s %12s %12s %12s %12s %12s %s\n" \
      "$Cfg" "-" "-" "-" "-" "-" "-" "-" "RUNNING/EMPTY"
    return
  fi

  awk -v CFG="$Cfg" '
  BEGIN {
    I=C=Dm=Im=L2m=Bm=0
    L1D_hits=0
    delete T
  }
  /^simInsts/ {I=$2}
  # Cycles: with several cores the run length is bounded by the slowest
  # core, so keep the maximum. Also accept cpu0/cpu1/... names, which the
  # old pattern (system.cpu.numCycles) never matched.
  /system\.cpu[0-9]*\.numCycles/ {if ($2+0 > C) C=$2}
  # gem5 >= v21 renamed cache stats to camelCase (overallMisses) and the
  # SE-mode L1s are named cpu.dcache / cpu.icache; accept old and new
  # spellings and sum across cores.
  /\.(dcache|l1d)\.overall(_m|M)isses::total/ {Dm+=$2}
  /\.(icache|l1i)\.overall(_m|M)isses::total/ {Im+=$2}
  /\.l2\.overall(_m|M)isses::total/           {L2m+=$2}
  /\.(dcache|l1d)\.overall(_h|H)its::total/   {L1D_hits+=$2}
  /branchPred\.mispredictions/ {Bm+=$2}
  # Per-thread committed instructions, keyed by SMT thread id.
  # NOTE(review): with several CPUs the thread ids collide across cores
  # (every core has a thread 0) and the last core parsed wins - confirm
  # whether per-core counts are wanted here.
  /commit\.committedInsts::[0-9]+/ {tid=$1; gsub(/.*::/,"",tid); T[tid]=$2}
  END {
    # Derived metrics; guard every division against a zero denominator.
    ipc=(C>0)? I/C : 0;
    cpi=(I>0)? C/I : 0;
    dmpki=(I>0)? 1000*Dm/I : 0;
    impki=(I>0)? 1000*Im/I : 0;
    l2mpki=(I>0)? 1000*L2m/I : 0;
    bmpki=(I>0)? 1000*Bm/I : 0;
    # L1D hit rate: hits / (hits + misses). The original divided by a
    # never-assigned miss counter, so the rate was 100% whenever any hits
    # were parsed.
    l1d_total=L1D_hits+Dm;
    l1d_util=(l1d_total>0)? (L1D_hits/l1d_total)*100 : 0;
    # Per-thread counts rendered as "t0=N t1=M ..." (iteration order of
    # for-in is unspecified in awk, matching the original behavior).
    out="";
    for (t in T) {
      if (out != "") out = out " ";
      out = out "t" t "=" T[t];
    }
    if (out == "") out="single-thread";
    printf "%-6s %8.3f %12.2f %12.2f %12.2f %12.2f %12.2f %12.1f %s\n",
      CFG, ipc, cpi, dmpki, impki, l2mpki, bmpki, l1d_util, out;
  }' "$S"
}
# ---- Main execution ----
# Redirect the whole report to a file when a second argument was supplied.
if [ -n "$OUTPUT_FILE" ]; then
  exec > "$OUTPUT_FILE"
fi

# Report banner.
printf '%s\n' "SMT Performance Analysis Report"
printf 'Generated: %s\n' "$(date)"
printf 'Results directory: %s\n' "$ROOT"
printf '\n'
print_header

# One row per configuration directory, sorted alphabetically by name.
for cfg_dir in "$ROOT"/*; do
  [ -d "$cfg_dir" ] || continue
  analyze_config "$cfg_dir"
done | sort

# Trailing legend explaining each column.
cat <<'EOF'

Legend:
 IPC = Instructions Per Cycle
 CPI = Cycles Per Instruction
 MPKI = Misses Per Kilo Instructions
 Cache Util% = L1D Cache Hit Rate
 Per-thread Stats = Instructions committed per thread
EOF

52
multiThreading/run_cmp.sh Normal file
View File

@@ -0,0 +1,52 @@
#!/usr/bin/env sh
# run_cmp.sh - run the CMP scaling sweep (1, 2 and 4 cores) in gem5 SE mode.
# Each configuration writes simout/simerr plus gem5 stats into $ROOT/<name>/.
set -eu
# Paths (your setup)
GEM5=/home/carlos/projects/gem5/gem5src/gem5
BIN="$GEM5/build/X86/gem5.opt"
SE="$GEM5/configs/deprecated/example/se.py" # SMT broken here; CMP is fine
# Workloads (one per core)
CMD1=/home/carlos/projects/gem5/gem5-run/memtouch/memtouch
CMD2=/home/carlos/projects/gem5/gem5-run/memtouch/memtouch
CMD3=/bin/ls
CMD4=/bin/echo
# Root directory under which one output directory per configuration is made.
ROOT=/home/carlos/projects/gem5/gem5-data/results/smt
mkdir -p "$ROOT"
# Simulation parameters shared by every configuration.
BP=LTAGE
MAXI=20000000 # 20M insts per experiment
L1I=32kB; L1D=32kB; L2=1MB
# Launch one gem5 SE-mode experiment.
#   $1 - configuration name (also the output sub-directory under $ROOT)
#   $2 - number of simulated CPU cores
#   $3 - semicolon-separated workload commands, one per core
# Reads globals: BIN SE ROOT BP MAXI L1I L1D L2.
# Writes gem5 stdout/stderr to <outdir>/simout and <outdir>/simerr.
run_cfg () {
  cfg_name=$1
  cfg_cores=$2
  cfg_cmds=$3
  cfg_out="$ROOT/$cfg_name"

  mkdir -p "$cfg_out"
  echo "[*] $cfg_name -> $cfg_out"

  "$BIN" --outdir="$cfg_out" "$SE" \
    --cmd="$cfg_cmds" \
    --cpu-type=DerivO3CPU \
    --num-cpus="$cfg_cores" \
    --caches --l2cache \
    --l1i_size="$L1I" --l1d_size="$L1D" --l2_size="$L2" \
    --bp-type="$BP" \
    --maxinsts="$MAXI" \
    > "$cfg_out/simout" 2> "$cfg_out/simerr"
}
# ST1: single-core baseline.
run_cfg ST1 1 "$CMD1"

# CMP2: two cores sharing the L2. se.py assigns the ';'-separated
# commands one per simulated CPU.
run_cfg CMP2 2 "$CMD1;$CMD2"

# CMP4: four cores sharing the L2.
run_cfg CMP4 4 "$CMD1;$CMD2;$CMD3;$CMD4"

printf '%s\n' "[*] CMP sweep complete."