{
    "meta": {
        "title": "D-Central — AI Inference Accelerators Beyond GPUs",
        "description": "13 non-GPU local-AI inference accelerators (NPUs, unified-memory SoCs, edge TPUs, RISC-V AI cards, LPUs) with compute, memory model, an honest local-LLM suitability verdict, runtimes and power. Memory capacity/bandwidth, not TOPS, decides LLM fit.",
        "generated": "2026-06-21T14:06:52+00:00",
        "version": "1.0",
        "license": "https://creativecommons.org/licenses/by/4.0/",
        "license_name": "CC BY 4.0",
        "source": "https://d-central.tech/ai-inference-accelerators/",
        "record_count": 13,
        "scope": "NON-GPU inference silicon only. GPUs and Apple Silicon M-series are in the sibling AI-GPU database (/data/ai-gpu-database/) and are NOT re-listed here (one Apple pointer row connects the two).",
        "thesis": "Local LLM inference is bound by memory CAPACITY and BANDWIDTH, not raw TOPS. Dedicated edge NPU/TPU silicon with tiny on-chip SRAM and a vision-CNN heritage generally CANNOT run LLMs; unified-memory SoCs and DRAM-backed AI cards can.",
        "provenance": "Every TOPS/memory/price figure is vendor- or first-party-doc sourced (cited per row). The D-Central Mining Bible does not cover NPU/TPU/LPU silicon; manufacturer docs are the authority.",
        "disclaimer": "NPU TOPS, SDK/runtime support and prices move quarterly — every row carries last_verified (2026-06); re-verify at source. Suitability verdicts reflect the memory-model reality, not vendor superlatives."
    },
    "rows": [
        {
            "id": "amd-ryzen-ai-max-plus-395",
            "accelerator": "AMD Ryzen AI Max+ 395 (Strix Halo)",
            "manufacturer": "AMD",
            "type": "unified-memory APU/SoC",
            "compute": "16 Zen 5 cores + 40 RDNA 3.5 iGPU CUs + XDNA 2 NPU ~50 TOPS (INT8)",
            "memory_model": "Up to 128 GB LPDDR5X-8000 unified; up to 96 GB assignable as VRAM (AMD Variable Graphics Memory)",
            "local_llm_suitability": "Excellent — runs Llama-70B Q8 on a single device via the iGPU + unified memory pool. NOTE: the 50-TOPS NPU is largely unused for general LLM inference as of mid-2026; the iGPU is the actual LLM path (NPU used only for specific accelerated paths via Lemonade SDK).",
            "runtime_support": "llama.cpp (Vulkan/ROCm), LM Studio, Ollama, AMD Lemonade SDK; OpenAI-compatible via Ollama/LM Studio server",
            "power": "Configurable ~45-120 W (mini-PC/laptop)",
            "notable": "Standout consumer single-box 70B-class LLM host; shipping in mini-PCs (GMKtec EVO-X2, Beelink GTR9 Pro, AMD Ryzen AI Halo dev platform).",
            "source": "https://www.amd.com/en/products/processors/laptop/ryzen/ai-300-series/amd-ryzen-ai-max-plus-395.html ; https://www.amd.com/en/blogs/2025/amd-ryzen-ai-max-395-processor-breakthrough-ai-.html",
            "last_verified": "2026-06"
        },
        {
            "id": "amd-ryzen-ai-300-strix-point-npu",
            "accelerator": "AMD Ryzen AI 300 (Strix Point) XDNA 2 NPU",
            "manufacturer": "AMD",
            "type": "NPU (in x86 laptop SoC)",
            "compute": "XDNA 2 NPU ~50 TOPS (INT8) + Zen 5 CPU + RDNA 3.5 iGPU",
            "memory_model": "Shared system LPDDR5X (dual-channel, typically 16-32 GB) — NOT the 256-bit wide pool of the Max+/Halo part",
            "local_llm_suitability": "Emerging — small models (Llama 3.1 8B, Phi 3.5 Mini) via Ryzen AI Software / Lemonade on the NPU; iGPU+RAM is the practical LLM path. Mainstream laptop tier, narrower memory than Strix Halo.",
            "runtime_support": "AMD Ryzen AI Software, Lemonade SDK, ONNX Runtime; llama.cpp on iGPU",
            "power": "~15-54 W laptop envelope",
            "notable": "Distinguish from Max+ 395: same NPU class, far less memory bandwidth/capacity — much weaker for large LLMs.",
            "source": "https://www.guru3d.com/story/ryzen-ai-395-technical-overview-zen-5-cores-and-xdna-2-ai-npu/ (XDNA 2 NPU TOPS); AMD Ryzen AI 300 series product pages",
            "last_verified": "2026-06"
        },
        {
            "id": "nvidia-jetson-agx-orin-64gb",
            "accelerator": "NVIDIA Jetson AGX Orin 64 GB",
            "manufacturer": "NVIDIA",
            "type": "edge SoC (Ampere GPU + ARM CPU + dual DLA)",
            "compute": "2048-core Ampere GPU + 64 Tensor cores + 12-core ARM; up to 275 TOPS (sparse INT8) / 170 TOPS (dense INT8); 5.3 FP32 TFLOPS",
            "memory_model": "64 GB 256-bit LPDDR5 unified, 204.8 GB/s",
            "local_llm_suitability": "Strong — runs 13B unquantized and larger quantized models via CUDA; large unified pool is the differentiator over smaller Jetsons.",
            "runtime_support": "llama.cpp, Ollama, NVIDIA TensorRT-LLM, MLC-LLM (full CUDA stack)",
            "power": "15 W / 30 W / 50 W presets, up to 60 W MAXN",
            "notable": "Credit-card-sized edge module; popular self-hosted always-on LLM/robotics node.",
            "source": "https://www.nvidia.com/content/dam/en-zz/Solutions/gtcf21/jetson-orin/nvidia-jetson-agx-orin-technical-brief.pdf ; https://www.storagereview.com/review/revisiting-the-nvidia-jetson-agx-orin-tiny-package-large-language-models",
            "last_verified": "2026-06"
        },
        {
            "id": "nvidia-jetson-orin-nano-super-8gb",
            "accelerator": "NVIDIA Jetson Orin Nano Super 8 GB",
            "manufacturer": "NVIDIA",
            "type": "edge SoC (Ampere GPU + ARM CPU)",
            "compute": "Up to 67 TOPS (INT8)",
            "memory_model": "8 GB LPDDR5 unified (also 4 GB variant)",
            "local_llm_suitability": "Limited — small 3B-8B quantized models only; 8 GB ceiling. Budget entry to the Jetson/CUDA LLM ecosystem.",
            "runtime_support": "llama.cpp, Ollama, MLC-LLM, TensorRT-LLM",
            "power": "7-25 W",
            "notable": "Cheapest CUDA-capable on-device LLM box; good for tinkering, not large models.",
            "source": "https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/jetson-orin/ (Orin Nano series: up to 67 TOPS, 7-25 W, 4/8 GB)",
            "last_verified": "2026-06"
        },
        {
            "id": "qualcomm-snapdragon-x-elite-hexagon-npu",
            "accelerator": "Qualcomm Snapdragon X Elite (Hexagon NPU)",
            "manufacturer": "Qualcomm",
            "type": "NPU (in ARM laptop SoC)",
            "compute": "Hexagon NPU 45 TOPS (INT8) + 12-core Oryon ARMv9 CPU + Adreno GPU",
            "memory_model": "Unified LPDDR5X up to 64 GB (commonly 16 GB at 8533 MT/s)",
            "local_llm_suitability": "Good / Emerging — runs 8B-13B-class models on-device; NPU optimized for INT4/INT8 (not FP16/BF16). Software maturity is the gating factor.",
            "runtime_support": "llama.cpp, Ollama (CPU/NPU paths maturing); Qualcomm AI Engine / QNN",
            "power": "Very efficient laptop envelope (NPU is a fraction of iGPU power)",
            "notable": "Windows-on-ARM AI PCs; community testing confirms usable power-efficient local LLMs on the NPU.",
            "source": "https://www.qualcomm.com/laptops/products/snapdragon-x-elite ; https://www.tweaktown.com/news/97822/ (45 TOPS NPU) ; https://www.xda-developers.com/these-llms-run-locally-snapdragon-x-elite-npu-surprisingly-good/",
            "last_verified": "2026-06"
        },
        {
            "id": "intel-core-ultra-series2-lunar-lake-npu",
            "accelerator": "Intel Core Ultra Series 2 (Lunar Lake) NPU",
            "manufacturer": "Intel",
            "type": "NPU (NPU 4, in x86 SoC)",
            "compute": "NPU up to 48 TOPS (INT8) + P/E cores + Xe2 (Arc) iGPU; combined platform ~120 TOPS",
            "memory_model": "On-package LPDDR5X (16 GB or 32 GB on Lunar Lake — capacity is the constraint)",
            "local_llm_suitability": "Emerging / Moderate — 7B-8B models via OpenVINO GenAI on NPU (NF4 supported); ~8 tok/s combining CPU+NPU+GPU. NPU draws ~2-3 W vs 15-25 W iGPU → best for efficiency, not peak throughput. Prompts >1024 tokens with >7B models may need >16 GB RAM.",
            "runtime_support": "Intel OpenVINO / OpenVINO GenAI (NPU plugin), WindowsML, DirectML, ONNX RT, WebNN",
            "power": "Laptop envelope; NPU ~2-3 W",
            "notable": "Strongest case is battery-efficient assistant loops, not large-model throughput.",
            "source": "https://docs.openvino.ai/2025/openvino-workflow-generative/inference-with-genai/inference-with-genai-on-npu.html ; https://www.intel.com/content/www/us/en/support/articles/000099574/ ; https://www.thurrott.com/hardware/303493/",
            "last_verified": "2026-06"
        },
        {
            "id": "tenstorrent-blackhole-p150a",
            "accelerator": "Tenstorrent Blackhole p150a",
            "manufacturer": "Tenstorrent",
            "type": "AI accelerator PCIe card (RISC-V Tensix)",
            "compute": "140 Tensix cores + 16 'big RISC-V' cores; 210 MB SRAM",
            "memory_model": "32 GB GDDR6 @ 512 GB/s (DRAM-backed)",
            "local_llm_suitability": "Good — purpose-built local LLM inference card; runs Llama/Qwen/Mistral/Mixtral/Falcon via Tenstorrent's open-source vLLM fork. QSFP-DD ports allow multi-card memory pooling.",
            "runtime_support": "Tenstorrent TT-Metalium / tt-vLLM (open source, OpenAI-compatible server); access 'down to the metal'",
            "power": "Up to 300 W, active-cooled",
            "notable": "~$1,399. Fully open-source software stack — aligns with the decentralization/sovereignty narrative; a credible non-NVIDIA local-LLM card.",
            "source": "https://tenstorrent.com/en/hardware/cards ; https://docs.tenstorrent.com/aibs/blackhole/specifications.html",
            "last_verified": "2026-06"
        },
        {
            "id": "tenstorrent-wormhole-n300d",
            "accelerator": "Tenstorrent Wormhole n300d",
            "manufacturer": "Tenstorrent",
            "type": "AI accelerator PCIe card (RISC-V Tensix, dual ASIC)",
            "compute": "2x Wormhole ASICs, 128 Tensix cores; 192 MB SRAM",
            "memory_model": "24 GB GDDR6 @ 576 GB/s (DRAM-backed)",
            "local_llm_suitability": "Good — prior-gen open-stack local LLM card; same TT software ecosystem as Blackhole.",
            "runtime_support": "TT-Metalium / tt-vLLM (open source, OpenAI-compatible)",
            "power": "Up to 300 W",
            "notable": "~$1,449. Predecessor to Blackhole; part of Tenstorrent's iterative open-hardware lineage.",
            "source": "https://docs.tenstorrent.com/aibs/wormhole/specifications.html ; https://tenstorrent.com/en/hardware/cards",
            "last_verified": "2026-06"
        },
        {
            "id": "hailo-10h",
            "accelerator": "Hailo-10H",
            "manufacturer": "Hailo",
            "type": "edge generative-AI accelerator (M.2)",
            "compute": "40 TOPS (INT4) / 20 TOPS (INT8), 2nd-gen neural core",
            "memory_model": "Direct DDR interface to on-module LPDDR4/4X, 4 GB or 8 GB",
            "local_llm_suitability": "Limited / Emerging — first Hailo part that CAN run small LLMs/VLMs/diffusion at the edge (the direct-DDR interface lifts the on-die-SRAM cap); capacity-limited to small models. Tested on Raspberry Pi AI HAT+ 2.",
            "runtime_support": "Hailo Dataflow Compiler / HailoRT SDK (vendor stack)",
            "power": "~2.5 W typical",
            "notable": "Genuinely runs gen-AI at the edge but is a small-model, vendor-SDK device — not a general OpenAI-API LLM host.",
            "source": "https://hailo.ai/products/ai-accelerators/hailo-10h-ai-accelerator/ ; https://www.cnx-software.com/2026/01/20/ (Raspberry Pi AI HAT+ 2 review)",
            "last_verified": "2026-06"
        },
        {
            "id": "hailo-8",
            "accelerator": "Hailo-8",
            "manufacturer": "Hailo",
            "type": "edge vision accelerator (M.2/PCIe)",
            "compute": "Up to 26 TOPS (INT8)",
            "memory_model": "All weights on-die SRAM — NO external memory interface (hard cap on model size)",
            "local_llm_suitability": "Not suitable (vision/CNN only) — no DRAM path; cannot hold LLM weights. Designed for vision networks. Included to correct a common misconception.",
            "runtime_support": "HailoRT (vision pipelines: detection/segmentation/classification)",
            "power": "Low single-digit W",
            "notable": "Market-leading edge vision accelerator; NOT an LLM device. The Hailo-10H (row id: hailo-10h) is the gen-AI successor.",
            "source": "https://hailo.ai/products/ai-accelerators/hailo-8-ai-accelerator/",
            "last_verified": "2026-06"
        },
        {
            "id": "google-coral-edge-tpu",
            "accelerator": "Google Coral Edge TPU (USB / M.2 / Dev Board)",
            "manufacturer": "Google",
            "type": "edge TPU (vision coprocessor)",
            "compute": "4 TOPS (INT8), 2 TOPS/W",
            "memory_model": "~8 MB on-chip SRAM; TensorFlow Lite INT8 models only",
            "local_llm_suitability": "Not suitable (vision/CNN only) — built for the convolutional vision era (e.g. MobileNet v2 ~400 fps); never designed for language models, no memory to hold LLM weights.",
            "runtime_support": "TensorFlow Lite (Edge TPU compiler) — vision models",
            "power": "~2 W",
            "notable": "Frequently asked about as an LLM accelerator; the answer is no. (Google's newer 'Coralboard' with a transformer-capable NPU is a separate, distinct product.)",
            "source": "https://www.coral.ai/docs/edgetpu/benchmarks/ ; https://www.blackscarab.ai/insights/google-coral-edge-tpu-guide",
            "last_verified": "2026-06"
        },
        {
            "id": "groq-lpu-groqcard",
            "accelerator": "Groq LPU (GroqCard)",
            "manufacturer": "Groq",
            "type": "datacenter inference ASIC (Language Processing Unit)",
            "compute": "Deterministic dataflow architecture",
            "memory_model": "230 MB on-chip SRAM @ ~80 TB/s, NO DRAM — a single chip cannot hold even a small model; ~576 LPUs needed to serve Llama-2-70B",
            "local_llm_suitability": "Not local (cloud/datacenter only) — accessed via GroqCloud token API; rack-scale only. Listed to clarify it is not home/local hardware.",
            "runtime_support": "GroqCloud API (OpenAI-compatible endpoint) — service, not local device",
            "power": "Datacenter card, rack-scale",
            "notable": "~$20k/card and useless in isolation; belongs to the cloud-inference economics story, not local hardware.",
            "source": "https://groq.com/lpu-architecture ; https://cryptoslate.com/groq-20000-lpu-card-breaks-ai-performance-records (230 MB SRAM, ~576 LPUs for 70B)",
            "last_verified": "2026-06"
        },
        {
            "id": "apple-silicon-m-series-pointer",
            "accelerator": "Apple Silicon M-series (M3/M4 Max/Pro) — see ai-gpu-database",
            "manufacturer": "Apple",
            "type": "unified-memory SoC (POINTER ROW — not duplicated here)",
            "compute": "Up to 40-core GPU + 16-core Apple Neural Engine (ANE)",
            "memory_model": "Up to 128 GB unified memory — the reference standard for local LLM on a SoC",
            "local_llm_suitability": "Excellent — but the LLM path is the GPU via Metal/MLX/llama.cpp, NOT the Apple Neural Engine (ANE is used for Core ML vision/system tasks, not general LLM decode). Full specs live in the sibling GPU dataset to avoid duplication.",
            "runtime_support": "MLX, llama.cpp (Metal), Ollama, LM Studio",
            "power": "Laptop/desktop envelope",
            "notable": "CROSS-LINK ONLY — full rows (apple-m4-max-128gb, apple-m3-max-128gb, apple-m4-pro-64gb) already in /data/ai-gpu-database/. Present here purely to disambiguate ANE vs GPU and keep the two datasets explicitly connected.",
            "source": "https://d-central.tech/data/ai-gpu-database/ (internal sibling dataset)",
            "last_verified": "2026-06"
        }
    ]
}