diff --git a/inference_benchmarks/inference_metrics.csv b/inference_benchmarks/inference_metrics.csv index 192235b..7c59d96 100644 --- a/inference_benchmarks/inference_metrics.csv +++ b/inference_benchmarks/inference_metrics.csv @@ -599,3 +599,5 @@ run_1763526867_non_optimized,1763526867.6077204,False,105,0,0.000243425369262695 run_1763526867_optimized,1763526867.7184992,True,105,0,0.10547971725463867,0,0,86.33642578125,,cuda run_1763526870_non_optimized,1763526870.5837529,False,110,0,0.00023603439331054688,0,0,73.24609375,,cuda run_1763526870_optimized,1763526870.6944253,True,110,0,0.10529303550720215,0,0,86.57470703125,,cuda +run_1763936322_non_optimized,1763936322.5285127,False,53,47,0.34784841537475586,135.11632631519785,7.401030114356508,83.77001953125,,cuda +run_1763936322_optimized,1763936322.6566308,True,53,47,0.12265253067016602,383.1963331143258,2.609628312131192,86.13916015625,,cuda diff --git a/inference_benchmarks/inference_metrics.json b/inference_benchmarks/inference_metrics.json index 45fec15..ebcc141 100644 --- a/inference_benchmarks/inference_metrics.json +++ b/inference_benchmarks/inference_metrics.json @@ -7799,6 +7799,45 @@ "memory_used_mb": 86.57470703125, "gpu_utilization": null, "device": "cuda" + }, + { + "run_name": "run_1763936322_non_optimized", + "timestamp": 1763936322.5285127, + "optimized": false, + "prompt_length": 53, + "generated_length": 47, + "total_time": 0.34784841537475586, + "tokens_per_second": 135.11632631519785, + "time_per_token": 7.401030114356508, + "memory_used_mb": 83.77001953125, + "gpu_utilization": null, + "device": "cuda" + }, + { + "run_name": "run_1763936322_optimized", + "timestamp": 1763936322.6566308, + "optimized": true, + "prompt_length": 53, + "generated_length": 47, + "total_time": 0.12265253067016602, + "tokens_per_second": 383.1963331143258, + "time_per_token": 2.609628312131192, + "memory_used_mb": 86.13916015625, + "gpu_utilization": null, + "device": "cuda" + }, + { + "run_name": "run_1763936325_non_optimized", + "timestamp": 1763936325.8276925, + "optimized": false, + "prompt_length": 66, + "generated_length": 34, + "total_time": 0.3239288330078125, + "tokens_per_second": 104.96132648735221, + "time_per_token": 9.527318617876839, + "memory_used_mb": 83.7705078125, + "gpu_utilization": null, + "device": "cuda" } ] } \ No newline at end of file diff --git a/inference_benchmarks/optimization_comparison.png b/inference_benchmarks/optimization_comparison.png index fc86ed4..bdf6079 100644 Binary files a/inference_benchmarks/optimization_comparison.png and b/inference_benchmarks/optimization_comparison.png differ diff --git a/inference_benchmarks/performance_over_time.png b/inference_benchmarks/performance_over_time.png index 45a653b..81dfe49 100644 Binary files a/inference_benchmarks/performance_over_time.png and b/inference_benchmarks/performance_over_time.png differ