Gemma-3 4b-it

Q4_K_M · 4B params · GGUF

intelligence: see on Artificial Analysis →

checkpoint: unsloth/gemma-3-4b-it-GGUF:gemma-3-4b-it-Q4_K_M.gguf

All runs (134)


raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	mixed_4096_256	1	197.2	197.2	—	—	—	—	—	4096	256	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	mixed_1024_1024	1	189.9	189.9	—	—	—	—	—	1024	1024	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	tg_1024	1	189.8	189.8	—	—	—	—	—	—	1024	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	mixed_384_1152	1	189.0	189.0	—	—	—	—	—	384	1152	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	mixed_2048_768	1	189.0	189.0	—	—	—	—	—	2048	768	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	mixed_1024_16	1	189.0	189.0	—	—	—	—	—	1024	16	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	tg_512	1	188.7	188.7	—	—	—	—	—	—	512	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	mixed_64_1024	1	188.1	188.1	—	—	—	—	—	64	1024	—	—
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595-r3	chat	1	186.7	167.8	818.5	—	36ms	5.4	—	29	100	566ms	0.000 GiB
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	tg_128	1	186.4	186.4	—	—	—	—	—	—	128	—	—
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595	chat	1	186.2	170.5	835.9	—	37ms	5.4	—	29	100	569ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595-r2	chat	1	185.0	171.5	1133.8	—	26ms	5.4	—	29	100	583ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-450w	chat	1	184.7	167.9	788.5	—	38ms	5.4	—	29	100	573ms	0.000 GiB
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	mixed_1280_3072	1	183.8	183.8	—	—	—	—	—	1280	3072	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	mixed_16_1536	1	183.7	183.7	—	—	—	—	—	16	1536	—	—
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 350 Wdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-350w-595-r2	chat	1	183.4	157.4	1047.1	—	29ms	5.5	—	29	100	635ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595	codegen	1	183.1	171.8	541.8	—	132ms	5.5	—	64	1000	5.82s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595-r3	codegen	1	182.6	171.6	555.0	—	119ms	5.5	—	64	1000	5.83s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 350 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-350w	chat	1	182.2	164.4	816.3	—	38ms	5.5	—	29	100	582ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595-r3	rag	1	181.9	101.9	1952.2	—	339ms	5.5	—	846	70	689ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595	rag	1	181.6	101.2	1976.0	—	392ms	5.5	—	846	70	779ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595-r2	codegen	1	181.5	172.2	2305.3	—	29ms	5.5	—	64	1000	5.81s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-450w	codegen	1	181.3	169.4	539.9	—	129ms	5.5	—	64	1000	5.90s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595	agent	1	181.3	157.6	2277.1	—	227ms	5.5	—	611	436	2.76s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595	agent	4	181.1	58.8	185.6	—	4.20s	5.5	—	611	436	6.28s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595-r3	agent	1	180.8	154.1	2265.1	—	228ms	5.5	—	611	436	2.78s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-450w	rag	1	180.6	95.5	1871.9	—	339ms	5.5	—	846	70	688ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595-r3	agent	4	180.6	68.9	183.2	—	3.46s	5.5	—	611	436	6.20s	0.000 GiB
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	mixed_2048_256	1	180.3	180.3	—	—	—	—	—	2048	256	—	—
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595-r2	rag	1	180.2	112.8	5385.3	—	231ms	5.5	—	846	70	674ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595-r2	agent	1	180.1	154.2	2550.4	—	203ms	5.6	—	611	436	2.76s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-450w	agent	1	180.1	152.8	2316.2	—	223ms	5.6	—	611	436	2.77s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-450w	agent	4	179.8	65.9	152.1	—	3.87s	5.6	—	611	436	6.77s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 350 Wdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-350w-595-r2	codegen	1	179.8	171.5	2058.6	—	29ms	5.6	—	64	1000	5.83s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-450w-595-r2	agent	4	179.7	63.7	206.8	—	3.72s	5.6	—	611	436	5.99s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 300 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline	chat	1	179.1	160.0	807.9	—	38ms	5.6	—	29	100	597ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 350 Wdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-350w-595-r2	rag	1	178.9	111.4	5753.7	—	235ms	5.6	—	846	70	649ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 350 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-350w	codegen	1	178.6	166.7	502.5	—	127ms	5.6	—	64	1000	6.00s	0.000 GiB
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	mixed_4096_256	1	177.7	177.7	—	—	—	—	—	4096	256	—	—
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 350 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-350w	rag	1	177.7	101.6	1932.8	—	387ms	5.6	—	846	70	792ms	0.000 GiB
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	mixed_2048_256	1	177.7	177.7	—	—	—	—	—	2048	256	—	—
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 350 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-350w	agent	1	177.7	153.2	2267.3	—	228ms	5.6	—	611	436	2.81s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 350 Wdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-350w-595-r2	agent	1	177.4	154.2	2369.8	—	218ms	5.6	—	611	436	2.84s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 350 Wdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-350w-595-r2	agent	4	177.3	73.4	169.2	—	4.01s	5.6	—	611	436	6.94s	0.000 GiB
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	tg_128	1	177.1	177.1	—	—	—	—	—	—	128	—	—
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 350 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-350w	agent	4	177.1	54.6	168.7	—	4.16s	5.6	—	611	436	6.39s	0.000 GiB
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	tg_512	1	176.5	176.5	—	—	—	—	—	—	512	—	—
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	mixed_1024_16	1	176.2	176.2	—	—	—	—	—	1024	16	—	—
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	mixed_2048_768	1	175.9	175.9	—	—	—	—	—	2048	768	—	—
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	mixed_1024_1024	1	175.1	175.1	—	—	—	—	—	1024	1024	—	—
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	mixed_64_1024	1	175.1	175.1	—	—	—	—	—	64	1024	—	—
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	tg_1024	1	175.0	175.0	—	—	—	—	—	—	1024	—	—
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 300 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline	codegen	1	174.7	160.9	531.5	—	123ms	5.7	—	64	1000	6.21s	0.000 GiB
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	mixed_384_1152	1	174.7	174.7	—	—	—	—	—	384	1152	—	—
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 300 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline	rag	1	174.6	97.1	2050.5	—	335ms	5.7	—	846	70	701ms	0.000 GiB
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	mixed_16_1536	1	173.6	173.6	—	—	—	—	—	16	1536	—	—
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 300 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline	agent	4	172.6	60.8	153.6	—	4.20s	5.8	—	611	436	7.07s	0.000 GiB
legacy	stack comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp cuda-1a68ec9 (cuda)	baseline	chat	1	172.6	166.9	1377.8	—	21ms	5.8	—	29	100	595ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 300 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline	agent	1	172.5	143.7	2266.9	—	228ms	5.8	—	611	436	2.84s	0.000 GiB
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	mixed_1280_3072	1	171.1	171.1	—	—	—	—	—	1280	3072	—	—
legacy	stack comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp cuda-1a68ec9 (cuda)	baseline	codegen	1	169.5	168.7	2014.9	—	35ms	5.9	—	64	1000	5.93s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 250 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-250w	chat	1	169.2	149.8	807.8	—	37ms	5.9	—	29	100	645ms	0.000 GiB
legacy	stack comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp cuda-1a68ec9 (cuda)	baseline	rag	1	168.4	143.0	10212.2	—	83ms	5.9	—	846	67	532ms	0.000 GiB
legacy	stack comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp cuda-1a68ec9 (cuda)	baseline	agent	1	168.4	163.7	10203.9	—	60ms	5.9	—	611	376	2.29s	0.000 GiB
legacy	stack comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp cuda-1a68ec9 (cuda)	baseline	agent	4	168.3	66.4	212.6	—	3.87s	5.9	—	611	376	6.17s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 250 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-250w	codegen	1	164.9	155.2	499.9	—	131ms	6.1	—	64	1000	6.45s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 250 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-250w	rag	1	164.6	95.1	1923.9	—	328ms	6.1	—	846	70	702ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 250 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-250w	agent	4	163.5	63.7	155.6	—	4.37s	6.1	—	611	436	7.49s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 250 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-250w	agent	1	163.4	139.9	2172.2	—	238ms	6.1	—	611	436	3.12s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-200w-595-r2	chat	1	142.9	130.3	744.7	—	41ms	7.0	—	29	100	759ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-200w	chat	1	142.9	128.5	652.1	—	45ms	7.0	—	29	100	742ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-200w-595-r2	rag	1	138.1	84.7	1960.4	—	372ms	7.2	—	846	70	807ms	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-200w	rag	1	133.5	84.0	1908.2	—	340ms	7.5	—	846	70	803ms	0.000 GiB
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	mixed_4096_256	1	129.9	129.9	—	—	—	—	—	4096	256	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	mixed_1024_16	1	129.1	129.1	—	—	—	—	—	1024	16	—	—
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-200w	agent	1	128.7	111.1	2080.5	—	249ms	7.8	—	611	436	3.91s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-200w-595-r2	agent	1	128.4	111.2	2104.8	—	246ms	7.8	—	611	436	3.91s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-200w	agent	4	128.2	53.0	121.7	—	5.41s	7.8	—	611	436	9.51s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 590	llama.cpp cuda-4f13cb7 (cuda)	baseline-pl-200w	codegen	1	127.9	120.9	540.1	—	127ms	7.8	—	64	1000	8.27s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-200w-595-r2	agent	4	126.9	45.2	129.3	—	5.64s	7.9	—	611	436	9.51s	0.000 GiB
legacy	stack comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp cuda-3e12fbd (cuda)	baseline-pl-200w-595-r2	codegen	1	126.7	119.3	552.2	—	125ms	7.9	—	64	1000	8.38s	0.000 GiB
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	tg_128	1	122.9	122.9	—	—	—	—	—	—	128	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	mixed_2048_256	1	122.2	122.2	—	—	—	—	—	2048	256	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	tg_512	1	120.9	120.9	—	—	—	—	—	—	512	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	mixed_2048_768	1	120.0	120.0	—	—	—	—	—	2048	768	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	tg_1024	1	119.9	119.9	—	—	—	—	—	—	1024	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	mixed_64_1024	1	119.9	119.9	—	—	—	—	—	64	1024	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	mixed_1024_1024	1	119.9	119.9	—	—	—	—	—	1024	1024	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	mixed_384_1152	1	118.8	118.8	—	—	—	—	—	384	1152	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	mixed_16_1536	1	118.5	118.5	—	—	—	—	—	16	1536	—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	mixed_1280_3072	1	117.6	117.6	—	—	—	—	—	1280	3072	—	—
legacy	stack comparable	Strix Halo · Radeon 8060S · 128 GiB unified (96 GiB VRAM)unified	llama.cpp b1203 (rocm)	baseline	chat	1	66.3	64.2	—	—	59ms	15.1	—	—	100	1.56s	0.001 GiB
legacy	stack comparable	Strix Halo · Radeon 8060S · 128 GiB unified (96 GiB VRAM)unified	llama.cpp b1203 (rocm)	baseline	codegen	1	65.0	64.6	—	—	99ms	15.4	—	—	1000	15.47s	0.002 GiB
legacy	stack comparable	Strix Halo · Radeon 8060S · 128 GiB unified (96 GiB VRAM)unified	llama.cpp b1203 (rocm)	baseline	agent	1	64.5	61.3	—	—	426ms	15.5	—	—	354	5.97s	0.002 GiB
legacy	stack comparable	Strix Halo · Radeon 8060S · 128 GiB unified (96 GiB VRAM)unified	llama.cpp b1203 (rocm)	baseline	rag	1	63.7	55.5	—	—	325ms	15.7	—	—	67	1.60s	0.002 GiB
legacy	stack comparable	Strix Halo · Radeon 8060S · 128 GiB unified (96 GiB VRAM)unified	llama.cpp b1203 (rocm)	baseline	agent	4	22.2	17.8	—	—	3.36s	45.0	—	—	376	21.06s	0.001 GiB
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	pp_512	1	—	—	9361.9	—	—	—	—	512		—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	pp_1024	1	—	—	9786.0	—	—	—	—	1024		—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	pp_2048	1	—	—	9515.6	—	—	—	—	2048		—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiB450 W maxdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2-pl450	pp_4096	1	—	—	9748.3	—	—	—	—	4096		—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	pp_512	1	—	—	6459.8	—	—	—	—	512		—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	pp_1024	1	—	—	6494.8	—	—	—	—	1024		—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	pp_2048	1	—	—	5858.7	—	—	—	—	2048		—	—
raw	hardware comparable	GeForce RTX 3090 · 24 GiBcap 200 Wdrv 595	llama.cpp llama.cpp-3e12fbd (cuda)	raw-v4-r2	pp_4096	1	—	—	5699.6	—	—	—	—	4096		—	—
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	pp_512	1	—	—	9445.5	—	—	—	—	512		—	—
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	pp_1024	1	—	—	9711.4	—	—	—	—	1024		—	—
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	pp_2048	1	—	—	9679.0	—	—	—	—	2048		—	—
raw	hardware comparable	GeForce RTX 5070 · 12 GiBcap 250 Wdrv 595	llama.cpp llama.cpp-1a68ec9 (cuda)	raw-v4-r2	pp_4096	1	—	—	9626.0	—	—	—	—	4096		—	—

Environment

GeForce RTX 3090 · 24 GiB

cpuAMD EPYC 7302P 16-Core Processor

gpuNVIDIA GeForce RTX 3090

archNVIDIA

vram24 GiB (system 64.0 GiB)

power200 W / 450 W max(44% cap)

pcieGen 4 x16 / Gen 4 x16 max

clocksgfx 1965/2100 MHz · mem 9501 MHz

temp44°C idle · 46°C peak

peak draw196 W

hardware probes

copy 42% of theory · FP16 peak 65.4 TF · copy/math flat across caps

384-bit · 9751 MHz · 82 SM/CU

The table below shows microbenchmarks for memory copy and tensor math only; model-serving speed is measured by the raw-engine decode and API workload rows.

cap	theory	copy	fp16	bf16
200 W	936 GB/s	391 GB/s	65.4 TF	65.4 TF
300 W	936 GB/s	391 GB/s	65.4 TF	65.3 TF
450 W	936 GB/s	391 GB/s	65.4 TF	65.4 TF

compute: 8.6

backendllama.cpp cuda-4f13cb7 (cuda)

osUbuntu 24.04 LTS

kernel6.17.13-7-pve

driverNVIDIA 590.48.01 + CUDA 13.1

libc2.39

python3.12.3

llama.cppversion: 18 (4f13cb7) built with GNU 13.3.0 for Linux x86_64

build flagsGGML_CUDA=ON CMAKE_BUILD_TYPE=Release

runs/cell5

warmups2

endpoint/v1/chat/completions

streamingtrue

GeForce RTX 3090 · 24 GiB

cpuAMD EPYC 7302P 16-Core Processor

gpuNVIDIA GeForce RTX 3090

archNVIDIA

vram24 GiB (system 64.0 GiB)

power250 W / 450 W max(56% cap)

pcieGen 4 x16 / Gen 4 x16 max

clocksgfx 1980/2100 MHz · mem 9501 MHz

temp47°C idle · 51°C peak

peak draw243 W

backendllama.cpp cuda-4f13cb7 (cuda)

osUbuntu 24.04 LTS

kernel6.17.13-7-pve

driverNVIDIA 590.48.01 + CUDA 13.1

libc2.39

python3.12.3

llama.cppversion: 18 (4f13cb7) built with GNU 13.3.0 for Linux x86_64

build flagsGGML_CUDA=ON CMAKE_BUILD_TYPE=Release

runs/cell5

warmups2

endpoint/v1/chat/completions

streamingtrue

GeForce RTX 3090 · 24 GiB

cpuAMD EPYC 7302P 16-Core Processor

gpuNVIDIA GeForce RTX 3090

archNVIDIA

vram24 GiB (system 64.0 GiB)

power350 W / 450 W max(78% cap)

pcieGen 4 x16 / Gen 4 x16 max

clocksgfx 1980/2100 MHz · mem 9501 MHz

temp54°C idle · 62°C peak

peak draw335 W

backendllama.cpp cuda-4f13cb7 (cuda)

osUbuntu 24.04 LTS

kernel6.17.13-7-pve

driverNVIDIA 590.48.01 + CUDA 13.1

libc2.39

python3.12.3

llama.cppversion: 18 (4f13cb7) built with GNU 13.3.0 for Linux x86_64

build flagsGGML_CUDA=ON CMAKE_BUILD_TYPE=Release

runs/cell5

warmups2

endpoint/v1/chat/completions

streamingtrue

GeForce RTX 3090 · 24 GiB

cpuAMD EPYC 7302P 16-Core Processor

gpuNVIDIA GeForce RTX 3090

archNVIDIA

vram24 GiB (system 64.0 GiB)

power450 W / 450 W max

pcieGen 4 x16 / Gen 4 x16 max

clocksgfx 1965/2100 MHz · mem 9501 MHz

temp60°C idle · 77°C peak

peak draw433 W

backendllama.cpp cuda-4f13cb7 (cuda)

osUbuntu 24.04 LTS

kernel6.17.13-7-pve

driverNVIDIA 590.48.01 + CUDA 13.1

libc2.39

python3.12.3

llama.cppversion: 18 (4f13cb7) built with GNU 13.3.0 for Linux x86_64

build flagsGGML_CUDA=ON CMAKE_BUILD_TYPE=Release

runs/cell5

warmups2

endpoint/v1/chat/completions

streamingtrue

GeForce RTX 3090 · 24 GiB

cpuAMD EPYC 7302P 16-Core Processor

gpuNVIDIA GeForce RTX 3090

archNVIDIA

vram24 GiB (system 64.0 GiB)

power200 W / 450 W max(44% cap)

pcieGen 4 x16 / Gen 4 x16 max

clocksgfx 1800/2100 MHz · mem 9501 MHz

temp39°C idle · 44°C peak

peak draw196 W

backendllama.cpp cuda-3e12fbd (cuda)

osUbuntu 24.04 LTS

kernel7.0.2-4-pve

driverNVIDIA 595.71.05 + CUDA 13.2

libc2.39

python3.12.3

runs/cell5

warmups2

endpoint/v1/chat/completions

streamingtrue

GeForce RTX 3090 · 24 GiB

cpuAMD EPYC 7302P 16-Core Processor

gpuNVIDIA GeForce RTX 3090

archNVIDIA

vram24 GiB (system 64.0 GiB)

power350 W / 450 W max(78% cap)

pcieGen 4 x16 / Gen 4 x16 max

clocksgfx 1800/2100 MHz · mem 9501 MHz

temp43°C idle · 60°C peak

peak draw337 W

backendllama.cpp cuda-3e12fbd (cuda)

osUbuntu 24.04 LTS

kernel7.0.2-4-pve

driverNVIDIA 595.71.05 + CUDA 13.2

libc2.39

python3.12.3

runs/cell5

warmups2

endpoint/v1/chat/completions

streamingtrue

GeForce RTX 3090 · 24 GiB

cpuAMD EPYC 7302P 16-Core Processor

gpuNVIDIA GeForce RTX 3090

archNVIDIA

vram24 GiB (system 64.0 GiB)

power450 W / 450 W max

pcieGen 4 x16 / Gen 4 x16 max

clocksgfx 1800/2100 MHz · mem 9501 MHz

temp52°C idle · 72°C peak

peak draw424 W

backendllama.cpp cuda-3e12fbd (cuda)

osUbuntu 24.04 LTS

kernel7.0.2-4-pve

driverNVIDIA 595.71.05 + CUDA 13.2

libc2.39

python3.12.3

runs/cell5

warmups2

endpoint/v1/chat/completions

streamingtrue

GeForce RTX 3090 · 24 GiB

cpuAMD EPYC 7302P 16-Core Processor

gpuNVIDIA GeForce RTX 3090

archNVIDIA

vram24 GiB (system 64.0 GiB)

power300 W / 450 W max(67% cap)

pcieGen 4 x16 / Gen 4 x16 max

clocksgfx 1950/2100 MHz · mem 9501 MHz

temp37°C idle · 64°C peak

peak draw291 W

backendllama.cpp cuda-4f13cb7 (cuda)

osUbuntu 24.04 LTS

kernel6.17.13-7-pve

driverNVIDIA 590.48.01 + CUDA 13.1

libc2.39

python3.12.3

llama.cppversion: 18 (4f13cb7) built with GNU 13.3.0 for Linux x86_64

build flagsGGML_CUDA=ON CMAKE_BUILD_TYPE=Release

runs/cell5

warmups2

endpoint/v1/chat/completions

streamingtrue

GeForce RTX 5070 · 12 GiB

cpuAMD Ryzen 9 7900 12-Core Processor

gpuNVIDIA GeForce RTX 5070

archNVIDIA

vram11.94 GiB (system 30.5 GiB)

power250 W / 300 W max(83% cap)

pcieGen 1 x16 / Gen 4 x16 max

clocksgfx 180/3090 MHz · mem 405 MHz

temp31°C idle · 64°C peak

peak draw194 W

hardware probes

copy 40% of theory · FP16 peak 69.6 TF · copy/math spread 2.5%

192-bit · 14001 MHz · 48 SM/CU

The table below shows microbenchmarks for memory copy and tensor math only; model-serving speed is measured by the raw-engine decode and API workload rows.

cap	theory	copy	fp16	bf16
200 W	672 GB/s	271 GB/s	67.9 TF	68.4 TF
250 W	672 GB/s	271 GB/s	69.5 TF	68.2 TF
300 W	672 GB/s	270 GB/s	69.6 TF	68.4 TF

compute: 12

backendllama.cpp cuda-1a68ec9 (cuda)

osCachyOS

kernel7.0.8-1-cachyos

driverNVIDIA 595.71.05 + CUDA 13.2

libc2.43

python3.14.4

build flagsGGML_CUDA=ON CMAKE_CUDA_ARCHITECTURES=120 CMAKE_BUILD_TYPE=Release

runs/cell5

warmups2

endpoint/v1/chat/completions

streamingtrue

Strix Halo · Radeon 8060S · 128 GiB unified (96 GiB VRAM)

cpuAMD RYZEN AI MAX+ 395 w/ Radeon 8060S

gpuAMD Radeon 8060S

archStrix Halo (gfx1151)

vram96 GiB (system 31.1 GiB, unified)

hardware probes

copy 41% of theory · FP16 peak 30.3 TF

256-bit · 8000 MHz · 20 SM/CU

The table below shows microbenchmarks for memory copy and tensor math only; model-serving speed is measured by the raw-engine decode and API workload rows.

cap	theory	copy	fp16	bf16
fixed	256 GB/s	106 GB/s	30.3 TF	-

compute: 11.5

backendllama.cpp b1203 (rocm)

osUbuntu 24.04.4 LTS

kernel7.0.2-2-pve

python3.12.3

runs/cell3

warmups1

endpoint/v1/chat/completions

streamingtrue

GeForce RTX 5070 · 12 GiB

cpuAMD Ryzen 9 7900 12-Core Processor

gpuNVIDIA GeForce RTX 5070

archNVIDIA

vram11.94 GiB (system 30.5 GiB)

power250 W / 300 W max(83% cap)

pcieGen 1 x16 / Gen 4 x16 max

clocksgfx 180/3090 MHz · mem 405 MHz

temp39°C idle · 62°C peak

peak draw175 W

backendllama.cpp vulkan-1a68ec9 (vulkan)

osCachyOS

kernel7.0.8-1-cachyos

driverNVIDIA 595.71.05 + CUDA 13.2

libc2.43

python3.14.4

llama.cppversion: 1 (1a68ec9) built with GNU 15.2.1 for Linux x86_64

build flagsGGML_VULKAN=ON CMAKE_BUILD_TYPE=Release

runs/cell5

warmups2

endpoint/v1/chat/completions

streamingtrue

GeForce RTX 5070 · 12 GiB

cpuAMD Ryzen 9 7900 12-Core Processor

gpuNVIDIA GeForce RTX 5070

archNVIDIA

vram11.94 GiB (system 30.4 GiB)

power250 W / 300 W max(83% cap)

backendllama.cpp b9174 (vulkan)

osCachyOS

kernel7.0.0-1-cachyos

driver595.58.03

python3.14.4

runs/cell5

warmups2

endpoint/v1/chat/completions

streamingtrue

GeForce RTX 3090 · 24 GiB

cpuAMD EPYC 7302P 16-Core Processor

gpuNVIDIA GeForce RTX 3090

archNVIDIA

vram24 GiB (system 64.0 GiB)

power450 W / 450 W max

clocksgfx 210 MHz · mem 405 MHz

temp34°C idle · 34°C peak

peak draw24 W

backendllama.cpp llama.cpp-3e12fbd (cuda)

osUbuntu 24.04 LTS

kernel7.0.2-4-pve

driverNVIDIA 595.71.05 + CUDA 13.2

python3.12.3

runs/cell3

warmups0

endpointllama-bench

streamingfalse

GeForce RTX 3090 · 24 GiB

cpuAMD EPYC 7302P 16-Core Processor

gpuNVIDIA GeForce RTX 3090

archNVIDIA

vram24 GiB (system 64.0 GiB)

power200 W / 450 W max(44% cap)

clocksgfx 210 MHz · mem 405 MHz

temp40°C idle · 40°C peak

peak draw26 W

backendllama.cpp llama.cpp-3e12fbd (cuda)

osUbuntu 24.04 LTS

kernel7.0.2-4-pve

driverNVIDIA 595.71.05 + CUDA 13.2

python3.12.3

runs/cell3

warmups0

endpointllama-bench

streamingfalse

GeForce RTX 5070 · 12 GiB

cpuAMD Ryzen 9 7900 12-Core Processor

gpuNVIDIA GeForce RTX 5070

archNVIDIA

vram11.94 GiB (system 30.4 GiB)

power250 W / 300 W max(83% cap)

clocksgfx 180 MHz · mem 405 MHz

temp30°C idle · 30°C peak

peak draw1 W

backendllama.cpp llama.cpp-1a68ec9 (cuda)

osCachyOS

kernel7.0.8-1-cachyos

driverNVIDIA 595.71.05 + CUDA 13.2

python3.14.4

runs/cell3

warmups0

endpointllama-bench

streamingfalse

GeForce RTX 5070 · 12 GiB

cpuAMD Ryzen 9 7900 12-Core Processor

gpuNVIDIA GeForce RTX 5070

archNVIDIA

vram11.94 GiB (system 30.4 GiB)

power250 W / 300 W max(83% cap)

clocksgfx 180 MHz · mem 405 MHz

temp32°C idle · 32°C peak

peak draw2 W

backendllama.cpp llama.cpp-b9174 (vulkan)

osCachyOS

kernel7.0.8-1-cachyos

driverNVIDIA 595.71.05 + CUDA 13.2

python3.14.4

runs/cell3

warmups0

endpointllama-bench

streamingfalse