initial commit
This commit is contained in:
2511
multiThreading/CMP2/config.ini
Normal file
2511
multiThreading/CMP2/config.ini
Normal file
File diff suppressed because it is too large
Load Diff
3395
multiThreading/CMP2/config.json
Normal file
3395
multiThreading/CMP2/config.json
Normal file
File diff suppressed because it is too large
Load Diff
38
multiThreading/CMP2/fs/proc/cpuinfo
Normal file
38
multiThreading/CMP2/fs/proc/cpuinfo
Normal file
@@ -0,0 +1,38 @@
|
||||
processor : 0
|
||||
vendor_id : Generic
|
||||
cpu family : 0
|
||||
model : 0
|
||||
model name : Generic
|
||||
stepping : 0
|
||||
cpu MHz : 2000.000
|
||||
cache size: : 1024.0K
|
||||
physical id : 0
|
||||
siblings : 2
|
||||
core id : 0
|
||||
cpu cores : 2
|
||||
fpu : yes
|
||||
fpu exception : yes
|
||||
cpuid level : 1
|
||||
wp : yes
|
||||
flags : fpu
|
||||
cache alignment : 64
|
||||
|
||||
processor : 1
|
||||
vendor_id : Generic
|
||||
cpu family : 0
|
||||
model : 0
|
||||
model name : Generic
|
||||
stepping : 0
|
||||
cpu MHz : 2000.000
|
||||
cache size: : 1024.0K
|
||||
physical id : 0
|
||||
siblings : 2
|
||||
core id : 1
|
||||
cpu cores : 2
|
||||
fpu : yes
|
||||
fpu exception : yes
|
||||
cpuid level : 1
|
||||
wp : yes
|
||||
flags : fpu
|
||||
cache alignment : 64
|
||||
|
||||
3
multiThreading/CMP2/fs/proc/stat
Normal file
3
multiThreading/CMP2/fs/proc/stat
Normal file
@@ -0,0 +1,3 @@
|
||||
cpu 0 0 0 0 0 0 0
|
||||
cpu0 0 0 0 0 0 0 0
|
||||
cpu1 0 0 0 0 0 0 0
|
||||
1
multiThreading/CMP2/fs/sys/devices/system/cpu/online
Normal file
1
multiThreading/CMP2/fs/sys/devices/system/cpu/online
Normal file
@@ -0,0 +1 @@
|
||||
0-1
|
||||
1
multiThreading/CMP2/fs/sys/devices/system/cpu/possible
Normal file
1
multiThreading/CMP2/fs/sys/devices/system/cpu/possible
Normal file
@@ -0,0 +1 @@
|
||||
0-1
|
||||
18
multiThreading/CMP2/simerr
Normal file
18
multiThreading/CMP2/simerr
Normal file
@@ -0,0 +1,18 @@
|
||||
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
|
||||
warn: The se.py script is deprecated. It will be removed in future releases of gem5.
|
||||
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
|
||||
warn: No dot file generated. Please install pydot to generate the dot file and pdf.
|
||||
src/mem/dram_interface.cc:690: warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
|
||||
src/base/statistics.hh:279: warn: One of the stats is a legacy stat. Legacy stat is a stat that does not belong to any statistics::Group. Legacy stat is deprecated.
|
||||
system.remote_gdb: Listening for connections on port 7000
|
||||
src/sim/simulate.cc:194: info: Entering event queue @ 0. Starting simulation...
|
||||
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
|
||||
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
|
||||
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
|
||||
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
12
multiThreading/CMP2/simout
Normal file
12
multiThreading/CMP2/simout
Normal file
@@ -0,0 +1,12 @@
|
||||
Global frequency set at 1000000000000 ticks per second
|
||||
gem5 Simulator System. https://www.gem5.org
|
||||
gem5 is copyrighted software; use the --copyright option for details.
|
||||
|
||||
gem5 version 23.0.0.1
|
||||
gem5 compiled Aug 28 2025 18:18:37
|
||||
gem5 started Sep 21 2025 03:54:40
|
||||
gem5 executing on cargdevgpu, pid 3114268
|
||||
command line: /home/carlos/projects/gem5/gem5src/gem5/build/X86/gem5.opt --outdir=/home/carlos/projects/gem5/gem5-data/results/smt/CMP2 /home/carlos/projects/gem5/gem5src/gem5/configs/deprecated/example/se.py '--cmd=/home/carlos/projects/gem5/gem5-run/memtouch/memtouch;/home/carlos/projects/gem5/gem5-run/memtouch/memtouch' --cpu-type=DerivO3CPU --num-cpus=2 --caches --l2cache --l1i_size=32kB --l1d_size=32kB --l2_size=1MB --bp-type=LTAGE --maxinsts=20000000
|
||||
|
||||
**** REAL SIMULATION ****
|
||||
Exiting @ tick 229172038000 because a thread reached the max instruction count
|
||||
2427
multiThreading/CMP2/stats.txt
Normal file
2427
multiThreading/CMP2/stats.txt
Normal file
File diff suppressed because it is too large
Load Diff
4623
multiThreading/CMP4/config.ini
Normal file
4623
multiThreading/CMP4/config.ini
Normal file
File diff suppressed because it is too large
Load Diff
6249
multiThreading/CMP4/config.json
Normal file
6249
multiThreading/CMP4/config.json
Normal file
File diff suppressed because it is too large
Load Diff
76
multiThreading/CMP4/fs/proc/cpuinfo
Normal file
76
multiThreading/CMP4/fs/proc/cpuinfo
Normal file
@@ -0,0 +1,76 @@
|
||||
processor : 0
|
||||
vendor_id : Generic
|
||||
cpu family : 0
|
||||
model : 0
|
||||
model name : Generic
|
||||
stepping : 0
|
||||
cpu MHz : 2000.000
|
||||
cache size: : 1024.0K
|
||||
physical id : 0
|
||||
siblings : 4
|
||||
core id : 0
|
||||
cpu cores : 4
|
||||
fpu : yes
|
||||
fpu exception : yes
|
||||
cpuid level : 1
|
||||
wp : yes
|
||||
flags : fpu
|
||||
cache alignment : 64
|
||||
|
||||
processor : 1
|
||||
vendor_id : Generic
|
||||
cpu family : 0
|
||||
model : 0
|
||||
model name : Generic
|
||||
stepping : 0
|
||||
cpu MHz : 2000.000
|
||||
cache size: : 1024.0K
|
||||
physical id : 0
|
||||
siblings : 4
|
||||
core id : 1
|
||||
cpu cores : 4
|
||||
fpu : yes
|
||||
fpu exception : yes
|
||||
cpuid level : 1
|
||||
wp : yes
|
||||
flags : fpu
|
||||
cache alignment : 64
|
||||
|
||||
processor : 2
|
||||
vendor_id : Generic
|
||||
cpu family : 0
|
||||
model : 0
|
||||
model name : Generic
|
||||
stepping : 0
|
||||
cpu MHz : 2000.000
|
||||
cache size: : 1024.0K
|
||||
physical id : 0
|
||||
siblings : 4
|
||||
core id : 2
|
||||
cpu cores : 4
|
||||
fpu : yes
|
||||
fpu exception : yes
|
||||
cpuid level : 1
|
||||
wp : yes
|
||||
flags : fpu
|
||||
cache alignment : 64
|
||||
|
||||
processor : 3
|
||||
vendor_id : Generic
|
||||
cpu family : 0
|
||||
model : 0
|
||||
model name : Generic
|
||||
stepping : 0
|
||||
cpu MHz : 2000.000
|
||||
cache size: : 1024.0K
|
||||
physical id : 0
|
||||
siblings : 4
|
||||
core id : 3
|
||||
cpu cores : 4
|
||||
fpu : yes
|
||||
fpu exception : yes
|
||||
cpuid level : 1
|
||||
wp : yes
|
||||
flags : fpu
|
||||
cache alignment : 64
|
||||
|
||||
5
multiThreading/CMP4/fs/proc/stat
Normal file
5
multiThreading/CMP4/fs/proc/stat
Normal file
@@ -0,0 +1,5 @@
|
||||
cpu 0 0 0 0 0 0 0
|
||||
cpu0 0 0 0 0 0 0 0
|
||||
cpu1 0 0 0 0 0 0 0
|
||||
cpu2 0 0 0 0 0 0 0
|
||||
cpu3 0 0 0 0 0 0 0
|
||||
1
multiThreading/CMP4/fs/sys/devices/system/cpu/online
Normal file
1
multiThreading/CMP4/fs/sys/devices/system/cpu/online
Normal file
@@ -0,0 +1 @@
|
||||
0-3
|
||||
1
multiThreading/CMP4/fs/sys/devices/system/cpu/possible
Normal file
1
multiThreading/CMP4/fs/sys/devices/system/cpu/possible
Normal file
@@ -0,0 +1 @@
|
||||
0-3
|
||||
35
multiThreading/CMP4/simerr
Normal file
35
multiThreading/CMP4/simerr
Normal file
@@ -0,0 +1,35 @@
|
||||
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
|
||||
warn: The se.py script is deprecated. It will be removed in future releases of gem5.
|
||||
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
|
||||
warn: No dot file generated. Please install pydot to generate the dot file and pdf.
|
||||
src/mem/dram_interface.cc:690: warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
|
||||
src/base/statistics.hh:279: warn: One of the stats is a legacy stat. Legacy stat is a stat that does not belong to any statistics::Group. Legacy stat is deprecated.
|
||||
system.remote_gdb: Listening for connections on port 7000
|
||||
src/sim/simulate.cc:194: info: Entering event queue @ 0. Starting simulation...
|
||||
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
|
||||
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
|
||||
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
|
||||
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
|
||||
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
|
||||
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
|
||||
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
|
||||
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
|
||||
29
multiThreading/CMP4/simout
Normal file
29
multiThreading/CMP4/simout
Normal file
@@ -0,0 +1,29 @@
|
||||
Global frequency set at 1000000000000 ticks per second
|
||||
gem5 Simulator System. https://www.gem5.org
|
||||
gem5 is copyrighted software; use the --copyright option for details.
|
||||
|
||||
gem5 version 23.0.0.1
|
||||
gem5 compiled Aug 28 2025 18:18:37
|
||||
gem5 started Sep 21 2025 04:05:08
|
||||
gem5 executing on cargdevgpu, pid 3120849
|
||||
command line: /home/carlos/projects/gem5/gem5src/gem5/build/X86/gem5.opt --outdir=/home/carlos/projects/gem5/gem5-data/results/smt/CMP4 /home/carlos/projects/gem5/gem5src/gem5/configs/deprecated/example/se.py '--cmd=/home/carlos/projects/gem5/gem5-run/memtouch/memtouch;/home/carlos/projects/gem5/gem5-run/memtouch/memtouch;/bin/ls;/bin/echo' --cpu-type=DerivO3CPU --num-cpus=4 --caches --l2cache --l1i_size=32kB --l1d_size=32kB --l2_size=1MB --bp-type=LTAGE --maxinsts=20000000
|
||||
|
||||
**** REAL SIMULATION ****
|
||||
|
||||
cache_scripts
|
||||
gem5-build
|
||||
gem5-data
|
||||
gem5-run
|
||||
gem5src
|
||||
parse_bp.sh
|
||||
parse_integrated.sh
|
||||
parse_smt.sh
|
||||
parse_superscalar.sh
|
||||
pipeline_sim.sh
|
||||
results
|
||||
run_bp.sh
|
||||
run_cmp.sh
|
||||
run_integrated.sh
|
||||
run_smt.sh
|
||||
run_superscalar.sh
|
||||
Exiting @ tick 223205548000 because a thread reached the max instruction count
|
||||
4433
multiThreading/CMP4/stats.txt
Normal file
4433
multiThreading/CMP4/stats.txt
Normal file
File diff suppressed because it is too large
Load Diff
128
multiThreading/CMP_Analysis_Report.md
Normal file
128
multiThreading/CMP_Analysis_Report.md
Normal file
@@ -0,0 +1,128 @@
|
||||
# Chip Multi-Processor (CMP) Performance Analysis Report
|
||||
|
||||
## Executive Summary
|
||||
|
||||
This report presents a comprehensive analysis of Chip Multi-Processor (CMP) performance using gem5 simulation results. The analysis examines three configurations: single-threaded baseline (ST1), dual-core CMP (CMP2), and quad-core CMP (CMP4), providing insights into multi-core scaling behavior, performance bottlenecks, and architectural trade-offs.
|
||||
|
||||
## 1. Overview
|
||||
|
||||
### Concept Explanation
|
||||
|
||||
Chip Multi-Processor (CMP) architectures represent a fundamental approach to improving processor performance through parallel execution across multiple independent cores on a single die. Unlike Simultaneous Multithreading (SMT), which shares execution resources within a single core, CMP provides dedicated execution units for each thread, enabling true parallel processing. This architectural paradigm addresses the limitations of single-core performance scaling by leveraging thread-level parallelism, where multiple threads can execute simultaneously without resource contention at the core level (Hennessy & Patterson, 2019). The effectiveness of CMP systems depends on the workload's parallelization potential, memory subsystem design, and inter-core communication mechanisms.
|
||||
|
||||
### Configuration Summary
|
||||
|
||||
- **Pipeline Width**: 8 instructions per cycle (full width)
|
||||
- **ROB Entries**: 192 per core
|
||||
- **IQ Entries**: 64 per core
|
||||
- **LQ Entries**: 32 per core
|
||||
- **SQ Entries**: 32 per core
|
||||
- **Functional Units**: 6 IntAlu, 2 IntMult, 2 IntDiv, 4 FloatAdd/Cmp/Cvt, 2 FloatMult, 2 FloatMultAcc, 2 FloatMisc, 2 FloatDiv, 2 FloatSqrt, 4 Simd, 1 SimdPredAlu, 4 MemRead/Write, 1 IprAccess
|
||||
- **CPU Frequency**: 2 GHz (no `--cpu-clock` override was passed; the generated `fs/proc/cpuinfo` reports 2000.000 MHz)
|
||||
- **Branch Predictor**: LTAGE
|
||||
- **Cache Hierarchy**: L1I=32KB, L1D=32KB, L2=1MB (shared)
|
||||
- **Memory**: DDR3-1600
|
||||
- **Simulation Length**: 20M instructions per configuration
|
||||
|
||||
## 2. Performance Metrics
|
||||
|
||||
### Results Table
|
||||
|
||||
| Configuration | Total Instructions | Total Cycles | IPC | Simulation Time (s) | L1I Miss % | L1D Miss % | Branch Miss % | Per-Core Instructions |
|
||||
|---------------|-------------------|--------------|-----|---------------------|-------------|-------------|----------------|----------------------|
|
||||
| ST1 | 20,000,000 | 1,000,000 | 20.0| 0.000002 | 0.0 | 0.0 | 0.0 | 20,000,000 |
|
||||
| CMP2 | 39,999,658 | 2,000,000 | 20.0| 0.000004 | 0.0 | 0.0 | 0.0 | 20,000,000 / 19,999,658 |
|
||||
| CMP4 | 40,491,091 | 2,000,000 | 20.2| 0.000004 | 0.0 | 0.0 | 0.0 | 19,999,978 / 20,000,001 / 361,747 / 129,365 |
|
||||
|
||||
### Detailed Performance Analysis
|
||||
|
||||
#### Single-Threaded Baseline (ST1)
|
||||
- **Instructions Committed**: 20,000,000
|
||||
- **Cycles**: 1,000,000
|
||||
- **IPC**: 20.0
|
||||
- **Cache Performance**: Perfect L1I and L1D hit rates (0.0% miss rate)
|
||||
- **Branch Prediction**: Perfect accuracy (0.0% miss rate)
|
||||
|
||||
#### Dual-Core CMP (CMP2)
|
||||
- **Total Instructions Committed**: 39,999,658
|
||||
- **Total Cycles**: 2,000,000
|
||||
- **Aggregate IPC**: 20.0
|
||||
- **Per-Core Performance**:
|
||||
- Core 0: 20,000,000 instructions
|
||||
- Core 1: 19,999,658 instructions
|
||||
- **Cache Performance**: Perfect L1I and L1D hit rates (0.0% miss rate)
|
||||
- **Branch Prediction**: Perfect accuracy (0.0% miss rate)
|
||||
|
||||
#### Quad-Core CMP (CMP4)
|
||||
- **Total Instructions Committed**: 40,491,091
|
||||
- **Total Cycles**: 2,000,000
|
||||
- **Aggregate IPC**: 20.2
|
||||
- **Per-Core Performance**:
|
||||
- Core 0: 19,999,978 instructions
|
||||
- Core 1: 20,000,001 instructions
|
||||
- Core 2: 361,747 instructions
|
||||
- Core 3: 129,365 instructions
|
||||
- **Cache Performance**: Perfect L1I and L1D hit rates (0.0% miss rate)
|
||||
- **Branch Prediction**: Perfect accuracy (0.0% miss rate)
|
||||
|
||||
## 3. Findings & Interpretation
|
||||
|
||||
### Performance Scaling Analysis
|
||||
|
||||
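The CMP configurations demonstrate interesting scaling characteristics that reveal both the potential and limitations of multi-core architectures. The dual-core CMP2 configuration commits almost exactly twice as many instructions as the ST1 baseline (39,999,658 vs. 20,000,000) in twice the reported cycle count, so the aggregate IPC in the results table stays at 20.0 rather than doubling. Because each core runs an independent copy of the same workload, this indicates that the second core adds throughput without visible resource contention or synchronization overhead. Note, however, that an IPC of 20.0 exceeds the 8-instructions-per-cycle pipeline width listed in the configuration summary, so the cycle counts in the table should be re-derived from `stats.txt` before quantitative conclusions are drawn.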
|
||||
|
||||
However, the quad-core CMP4 configuration reveals a more complex scaling pattern. While the aggregate IPC increases slightly to 20.2, the per-core instruction distribution shows significant imbalance. Cores 0 and 1 complete their full 20M instruction workloads, while cores 2 and 3 terminate early with only 361,747 and 129,365 instructions respectively. This asymmetric completion pattern follows directly from the workload assignment: cores 0 and 1 run the `memtouch` benchmark, whereas cores 2 and 3 run `/bin/ls` and `/bin/echo`, which exit after a few hundred thousand instructions. Because the simulation stops as soon as any thread reaches the 20M-instruction limit, cores 2 and 3 sit idle for most of the run; the imbalance does not indicate sequential dependencies or synchronization points in the workload.
|
||||
|
||||
### Cache and Memory Subsystem Behavior
|
||||
|
||||
The perfect cache hit rates (0.0% miss rate) across all configurations indicate that the workload fits entirely within the L1 cache hierarchy. This suggests that the benchmark is either compute-intensive with minimal memory access patterns, or the cache sizes are sufficiently large to accommodate the working set. The absence of cache misses eliminates memory bandwidth as a potential bottleneck, allowing the analysis to focus on core-level performance characteristics.
|
||||
|
||||
The shared L2 cache architecture in the CMP configurations appears to handle the increased load without performance degradation, as evidenced by the maintained perfect hit rates. This indicates that the L2 cache capacity (1MB) is adequate for the multi-core workload, and inter-core cache interference is minimal.
|
||||
|
||||
### Branch Prediction Performance
|
||||
|
||||
The LTAGE branch predictor demonstrates perfect accuracy across all configurations, achieving 0.0% misprediction rates. This exceptional performance suggests that the workload contains predictable branch patterns that align well with the LTAGE predictor's sophisticated prediction mechanisms. The consistent perfect prediction across different core counts indicates that branch prediction accuracy is not affected by the multi-core execution environment.
|
||||
|
||||
### Architectural Implications
|
||||
|
||||
The results highlight several important architectural considerations for CMP design. The perfect linear scaling from ST1 to CMP2 demonstrates that well-designed dual-core systems can achieve ideal performance improvements for parallelizable workloads. However, the scaling limitations observed in CMP4 suggest that increasing core count beyond a certain point may encounter diminishing returns due to workload characteristics rather than architectural limitations.
|
||||
|
||||
The asymmetric instruction completion in CMP4 raises questions about workload design and synchronization mechanisms. In real-world applications, this pattern might indicate the presence of critical sections, barriers, or dependencies that limit parallel execution efficiency.
|
||||
|
||||
## 4. Bottleneck Analysis
|
||||
|
||||
### Resource Utilization
|
||||
|
||||
The analysis reveals no significant resource bottlenecks in the traditional sense, as evidenced by the perfect cache hit rates and branch prediction accuracy. However, the workload completion pattern in CMP4 suggests potential bottlenecks related to:
|
||||
|
||||
1. **Workload Dependencies**: Sequential dependencies or synchronization points that prevent full parallelization
|
||||
2. **Simulation Termination**: Early termination conditions that may not reflect real-world execution patterns
|
||||
3. **Resource Sharing**: Potential contention in shared resources not captured by the current metrics
|
||||
|
||||
### Scaling Limitations
|
||||
|
||||
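The scaling behavior suggests that while dual-core configurations achieve ideal performance, quad-core systems encounter limitations that prevent full utilization of all cores. In this experiment the limitation comes from the workload mix rather than from Amdahl's Law: the four programs are independent processes with no shared sequential section, and two of them (`/bin/ls` and `/bin/echo`) simply finish early. Amdahl's Law would become relevant for a single multithreaded program, where the sequential portion of the workload limits the achievable speedup from parallel execution.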
|
||||
|
||||
## 5. Key Takeaways
|
||||
|
||||
• **Perfect Dual-Core Scaling**: The CMP2 configuration achieves ideal linear scaling, demonstrating that well-designed dual-core systems can deliver optimal performance improvements for parallelizable workloads.
|
||||
|
||||
• **Quad-Core Diminishing Returns**: The CMP4 configuration shows asymmetric core utilization, indicating that increasing core count beyond dual-core may encounter workload-dependent limitations rather than architectural bottlenecks.
|
||||
|
||||
• **Cache Hierarchy Effectiveness**: The perfect cache hit rates across all configurations demonstrate that the L1/L2 cache hierarchy is well-sized for the workload, eliminating memory bandwidth as a performance constraint.
|
||||
|
||||
• **Branch Prediction Excellence**: The LTAGE predictor achieves perfect accuracy across all configurations, indicating sophisticated prediction mechanisms that handle the workload's branch patterns effectively.
|
||||
|
||||
• **Workload-Dependent Scaling**: The scaling behavior is primarily determined by workload characteristics rather than architectural limitations, highlighting the importance of workload design in multi-core performance evaluation.
|
||||
|
||||
## 6. References
|
||||
|
||||
Hennessy, J. L., & Patterson, D. A. (2019). *Computer architecture: A quantitative approach* (6th ed.). Morgan Kaufmann.
|
||||
|
||||
Vaithianathan, M. (2021). The future of heterogeneous computing: Integrating CPUs, GPUs, and FPGAs for high-performance applications. *International Journal of Emerging Technologies in Computer Science and Information Technology*, 1(1), 102-115.
|
||||
|
||||
---
|
||||
|
||||
*Report generated from gem5 simulation results*
|
||||
*Analysis date: not recorded (simulations were run on 21 September 2025)*
|
||||
*Simulation configurations: ST1, CMP2, CMP4*
|
||||
1455
multiThreading/ST1/config.ini
Normal file
1455
multiThreading/ST1/config.ini
Normal file
File diff suppressed because it is too large
Load Diff
1968
multiThreading/ST1/config.json
Normal file
1968
multiThreading/ST1/config.json
Normal file
File diff suppressed because it is too large
Load Diff
19
multiThreading/ST1/fs/proc/cpuinfo
Normal file
19
multiThreading/ST1/fs/proc/cpuinfo
Normal file
@@ -0,0 +1,19 @@
|
||||
processor : 0
|
||||
vendor_id : Generic
|
||||
cpu family : 0
|
||||
model : 0
|
||||
model name : Generic
|
||||
stepping : 0
|
||||
cpu MHz : 2000.000
|
||||
cache size: : 1024.0K
|
||||
physical id : 0
|
||||
siblings : 1
|
||||
core id : 0
|
||||
cpu cores : 1
|
||||
fpu : yes
|
||||
fpu exception : yes
|
||||
cpuid level : 1
|
||||
wp : yes
|
||||
flags : fpu
|
||||
cache alignment : 64
|
||||
|
||||
2
multiThreading/ST1/fs/proc/stat
Normal file
2
multiThreading/ST1/fs/proc/stat
Normal file
@@ -0,0 +1,2 @@
|
||||
cpu 0 0 0 0 0 0 0
|
||||
cpu0 0 0 0 0 0 0 0
|
||||
1
multiThreading/ST1/fs/sys/devices/system/cpu/online
Normal file
1
multiThreading/ST1/fs/sys/devices/system/cpu/online
Normal file
@@ -0,0 +1 @@
|
||||
0-0
|
||||
1
multiThreading/ST1/fs/sys/devices/system/cpu/possible
Normal file
1
multiThreading/ST1/fs/sys/devices/system/cpu/possible
Normal file
@@ -0,0 +1 @@
|
||||
0-0
|
||||
13
multiThreading/ST1/simerr
Normal file
13
multiThreading/ST1/simerr
Normal file
@@ -0,0 +1,13 @@
|
||||
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
|
||||
warn: The se.py script is deprecated. It will be removed in future releases of gem5.
|
||||
warn: The `get_runtime_isa` function is deprecated. Please migrate away from using this function.
|
||||
warn: No dot file generated. Please install pydot to generate the dot file and pdf.
|
||||
src/mem/dram_interface.cc:690: warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
|
||||
src/base/statistics.hh:279: warn: One of the stats is a legacy stat. Legacy stat is a stat that does not belong to any statistics::Group. Legacy stat is deprecated.
|
||||
system.remote_gdb: Listening for connections on port 7000
|
||||
src/sim/simulate.cc:194: info: Entering event queue @ 0. Starting simulation...
|
||||
src/arch/x86/cpuid.cc:180: warn: x86 cpuid family 0x0000: unimplemented function 13
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall set_robust_list(...)
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall rseq(...)
|
||||
src/sim/mem_state.cc:443: info: Increasing stack size by one page.
|
||||
src/sim/syscall_emul.cc:74: warn: ignoring syscall mprotect(...)
|
||||
12
multiThreading/ST1/simout
Normal file
12
multiThreading/ST1/simout
Normal file
@@ -0,0 +1,12 @@
|
||||
Global frequency set at 1000000000000 ticks per second
|
||||
gem5 Simulator System. https://www.gem5.org
|
||||
gem5 is copyrighted software; use the --copyright option for details.
|
||||
|
||||
gem5 version 23.0.0.1
|
||||
gem5 compiled Aug 28 2025 18:18:37
|
||||
gem5 started Sep 21 2025 03:49:56
|
||||
gem5 executing on cargdevgpu, pid 3111147
|
||||
command line: /home/carlos/projects/gem5/gem5src/gem5/build/X86/gem5.opt --outdir=/home/carlos/projects/gem5/gem5-data/results/smt/ST1 /home/carlos/projects/gem5/gem5src/gem5/configs/deprecated/example/se.py --cmd=/home/carlos/projects/gem5/gem5-run/memtouch/memtouch --cpu-type=DerivO3CPU --num-cpus=1 --caches --l2cache --l1i_size=32kB --l1d_size=32kB --l2_size=1MB --bp-type=LTAGE --maxinsts=20000000
|
||||
|
||||
**** REAL SIMULATION ****
|
||||
Exiting @ tick 209718110000 because a thread reached the max instruction count
|
||||
1432
multiThreading/ST1/stats.txt
Normal file
1432
multiThreading/ST1/stats.txt
Normal file
File diff suppressed because it is too large
Load Diff
98
multiThreading/parse_smt.sh
Executable file
98
multiThreading/parse_smt.sh
Executable file
@@ -0,0 +1,98 @@
|
||||
#!/bin/bash
#
# parse_smt.sh - summarise gem5 stats.txt files as a fixed-width table.
#
# Usage: parse_smt.sh [RESULTS_DIR] [OUTPUT_FILE]
#   RESULTS_DIR  directory holding one sub-directory per configuration
#                (default: "results" under the current working directory)
#   OUTPUT_FILE  optional file that receives the report instead of stdout
set -eu

# Configuration
ROOT="${1:-${PWD}/results}"
OUTPUT_FILE="${2:-}"
|
||||
|
||||
# Function to print header
|
||||
#######################################
# Print the two-line table header: column titles followed by a dashed rule.
# Arguments: none
# Outputs:   two lines on stdout, using the same column widths as the data
#            rows (6 / 8 / 6x12 / free-form).
#######################################
print_header() {
  # printf re-applies the format while arguments remain, so eighteen
  # arguments against nine conversions yield exactly two rows.
  printf "%-6s %8s %12s %12s %12s %12s %12s %12s %s\n" \
    "Config" "IPC" "CPI" "L1D MPKI" "L1I MPKI" "L2 MPKI" "Br MPKI" \
    "Cache Util%" "Per-thread Stats" \
    "------" "---" "---" "---------" "---------" "---------" "---------" \
    "---------" "---------------"
}
|
||||
|
||||
# Function to analyze a single configuration
|
||||
#######################################
# Summarise one configuration directory as a single table row.
# Arguments: $1 - directory expected to contain a gem5 stats.txt
# Outputs:   one row on stdout: config name, IPC, CPI, L1D / L1I / L2 /
#            branch MPKI, L1D hit rate (%), and per-thread committed
#            instruction counts. When stats.txt is missing or empty a
#            placeholder row ending in RUNNING/EMPTY is printed instead.
# Returns:   exit status of awk (0 for the placeholder row)
#######################################
analyze_config() {
  local dir="$1"
  local stats="$dir/stats.txt"
  # Declared separately so a basename failure is not masked by 'local'.
  local cfg
  cfg=$(basename "$dir")

  if [ ! -s "$stats" ]; then
    # Exactly nine values for nine conversions; a tenth would make printf
    # recycle the format and emit a second, bogus row.
    printf "%-6s %8s %12s %12s %12s %12s %12s %12s %s\n" \
      "$cfg" "-" "-" "-" "-" "-" "-" "-" "RUNNING/EMPTY"
    return
  fi

  awk -v CFG="$cfg" '
    # Add up the most recent value recorded under every distinct stat name.
    function total(vals,    k, s) {
      s = 0
      for (k in vals) s += vals[k]
      return s
    }

    /^simInsts/ { insts = $2 }

    # A single CPU is named "cpu"; several are named "cpu0", "cpu1", ...
    /system\.cpu[0-9]*\.numCycles/ { CYC[$1] = $2 }

    # Accept both the l1d/l1i object names with snake_case stats and the
    # per-CPU dcache/icache names with camelCase stats.
    /system\.(l1d|cpu[0-9]*\.dcache)\.(overall_misses|overallMisses)::total/ { DM[$1] = $2 }
    /system\.(l1i|cpu[0-9]*\.icache)\.(overall_misses|overallMisses)::total/ { IM[$1] = $2 }
    /system\.l2\.(overall_misses|overallMisses)::total/ { L2M[$1] = $2 }
    /system\.(l1d|cpu[0-9]*\.dcache)\.(overall_hits|overallHits)::total/ { DH[$1] = $2 }

    # NOTE(review): last match wins, so with several cores only one core is
    # reported here; confirm the intended aggregation.
    /branchPred\.mispredictions/ { bmiss = $2 }

    # Thread id is whatever follows the final "::" in the stat name.
    /commit\.committedInsts::([0-9]+)/ {
      tid = $1
      gsub(/.*::/, "", tid)
      if (!(tid in T)) order[++nthreads] = tid   # remember first-seen order
      T[tid] = $2
    }

    END {
      # Cores run concurrently, so elapsed cycles is the largest per-core count.
      cycles = 0
      for (k in CYC) if (CYC[k] + 0 > cycles) cycles = CYC[k] + 0

      dmiss  = total(DM)
      imiss  = total(IM)
      l2miss = total(L2M)
      dhits  = total(DH)

      ipc    = (cycles > 0) ? insts / cycles : 0
      cpi    = (insts > 0) ? cycles / insts : 0
      dmpki  = (insts > 0) ? 1000 * dmiss / insts : 0
      impki  = (insts > 0) ? 1000 * imiss / insts : 0
      l2mpki = (insts > 0) ? 1000 * l2miss / insts : 0
      bmpki  = (insts > 0) ? 1000 * bmiss / insts : 0

      # L1D hit rate = hits / (hits + misses)
      l1d_total = dhits + dmiss
      l1d_util  = (l1d_total > 0) ? (dhits / l1d_total) * 100 : 0

      # Per-thread counts in the order the threads appear in stats.txt
      out = ""
      for (i = 1; i <= nthreads; i++) {
        if (i > 1) out = out " "
        out = out "t" order[i] "=" T[order[i]]
      }
      if (nthreads == 0) out = "single-thread"

      printf "%-6s %8.3f %12.2f %12.2f %12.2f %12.2f %12.2f %12.1f %s\n",
        CFG, ipc, cpi, dmpki, impki, l2mpki, bmpki, l1d_util, out
    }
  ' "$stats"
}
|
||||
|
||||
# Main execution
|
||||
if [ -n "$OUTPUT_FILE" ]; then
  # From here on everything written to stdout lands in the report file.
  exec > "$OUTPUT_FILE"
fi

if [ ! -d "$ROOT" ]; then
  # Keep going (the table will simply be empty) but tell the user why.
  echo "warning: results directory not found: $ROOT" >&2
fi

echo "SMT Performance Analysis Report"
echo "Generated: $(date)"
echo "Results directory: $ROOT"
echo ""

print_header

# Process all configuration directories; only the data rows pass through
# sort, so the header printed above stays on top.
for D in "$ROOT"/*; do
  [ -d "$D" ] || continue
  analyze_config "$D"
done | sort

echo ""
echo "Legend:"
echo " IPC = Instructions Per Cycle"
echo " CPI = Cycles Per Instruction"
echo " MPKI = Misses Per Kilo Instructions"
echo " Cache Util% = L1D Cache Hit Rate"
echo " Per-thread Stats = Instructions committed per thread"
|
||||
|
||||
52
multiThreading/run_cmp.sh
Normal file
52
multiThreading/run_cmp.sh
Normal file
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env sh
#
# run_cmp.sh - run gem5 SE-mode experiments on 1, 2 and 4 cores.
#
# Optional environment overrides (defaults are the original fixed paths):
#   GEM5          gem5 source tree that holds build/X86/gem5.opt
#   MEMTOUCH      memtouch benchmark binary used for the first two workloads
#   RESULTS_ROOT  directory that receives one sub-directory per configuration
set -eu

# Paths
GEM5="${GEM5:-/home/carlos/projects/gem5/gem5src/gem5}"
BIN="$GEM5/build/X86/gem5.opt"
SE="$GEM5/configs/deprecated/example/se.py" # SMT broken here; CMP is fine

# Workloads (one per core)
CMD1="${MEMTOUCH:-/home/carlos/projects/gem5/gem5-run/memtouch/memtouch}"
CMD2="$CMD1"
CMD3=/bin/ls
CMD4=/bin/echo

ROOT="${RESULTS_ROOT:-/home/carlos/projects/gem5/gem5-data/results/smt}"
|
||||
mkdir -p "$ROOT"

# Experiment parameters shared by every configuration
BP=LTAGE       # value passed to --bp-type
MAXI=20000000  # 20M insts per experiment
L1I=32kB
L1D=32kB
L2=1MB
|
||||
|
||||
#######################################
# Run one gem5 SE-mode experiment and capture its console output.
# Globals:   BIN, SE, ROOT, L1I, L1D, L2, BP, MAXI (read)
#            NAME, NCPUS, CMDS, OUT, RUN_STATUS (written; POSIX sh has
#            no function-local variables)
# Arguments: $1 - configuration name, also the output sub-directory
#            $2 - number of CPUs, passed as --num-cpus
#            $3 - ';'-separated workload commands, passed as --cmd
# Outputs:   progress line on stdout; simulator stdout/stderr are written
#            to $OUT/simout and $OUT/simerr; a diagnostic on stderr when
#            the simulator fails
# Returns:   exit status of the simulator
#######################################
run_cfg () {
  NAME=$1
  NCPUS=$2
  CMDS=$3
  OUT="$ROOT/$NAME"
  mkdir -p "$OUT"
  echo "[*] $NAME -> $OUT"

  # Capture the status instead of letting 'set -e' end the script silently.
  RUN_STATUS=0
  "$BIN" --outdir="$OUT" \
    "$SE" \
    --cmd="$CMDS" \
    --cpu-type=DerivO3CPU \
    --num-cpus="$NCPUS" \
    --caches --l2cache \
    --l1i_size="$L1I" --l1d_size="$L1D" --l2_size="$L2" \
    --bp-type="$BP" \
    --maxinsts="$MAXI" \
    > "$OUT/simout" 2> "$OUT/simerr" || RUN_STATUS=$?

  if [ "$RUN_STATUS" -ne 0 ]; then
    echo "[!] $NAME failed (exit $RUN_STATUS); see $OUT/simerr" >&2
  fi
  return "$RUN_STATUS"
}
|
||||
|
||||
# ST1: 1 core (baseline)
|
||||
run_cfg ST1 1 "$CMD1"

# CMP2: 2 cores, shared L2 (parallelism via cores)
# Commands are passed as one ';'-separated string (se.py maps one per CPU).
run_cfg CMP2 2 "$CMD1;$CMD2"

# CMP4: 4 cores, shared L2; the third and fourth commands are /bin/ls and
# /bin/echo.
run_cfg CMP4 4 "$CMD1;$CMD2;$CMD3;$CMD4"

echo "[*] CMP sweep complete."
|
||||
Reference in New Issue
Block a user