.clang-format .dockerignore .gitignore .pre-commit-config.yaml .readthedocs.yaml .shellcheckrc .yapfignore CMakeLists.txt CODE_OF_CONDUCT.md CONTRIBUTING.md DCO LICENSE MANIFEST.in README.md RELEASE.md SECURITY.md find_cuda_init.py format.sh mkdocs.yaml pyproject.toml setup.py use_existing_torch.py .buildkite/check-wheel-size.py .buildkite/generate_index.py .buildkite/pyproject.toml .buildkite/release-pipeline.yaml .buildkite/test-pipeline.yaml .buildkite/lm-eval-harness/conftest.py .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh .buildkite/lm-eval-harness/test_lm_eval_correctness.py .buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml .buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml .buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml .buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml .buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml .buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml .buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml .buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml .buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml .buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml .buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml .buildkite/lm-eval-harness/configs/models-large.txt .buildkite/lm-eval-harness/configs/models-small.txt .buildkite/nightly-benchmarks/README.md .buildkite/nightly-benchmarks/benchmark-pipeline.yaml .buildkite/nightly-benchmarks/nightly-annotation.md .buildkite/nightly-benchmarks/nightly-descriptions.md .buildkite/nightly-benchmarks/nightly-pipeline.yaml .buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md .buildkite/nightly-benchmarks/scripts/compare-json-results.py .buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py .buildkite/nightly-benchmarks/scripts/download-tokenizer.py .buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py .buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py .buildkite/nightly-benchmarks/scripts/launch-server.sh .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh .buildkite/nightly-benchmarks/scripts/summary-nightly-results.py .buildkite/nightly-benchmarks/scripts/wait-for-image.sh .buildkite/nightly-benchmarks/tests/genai-perf-tests.json .buildkite/nightly-benchmarks/tests/latency-tests-cpu.json .buildkite/nightly-benchmarks/tests/latency-tests.json .buildkite/nightly-benchmarks/tests/nightly-tests.json .buildkite/nightly-benchmarks/tests/serving-tests-cpu.json .buildkite/nightly-benchmarks/tests/serving-tests.json .buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json .buildkite/nightly-benchmarks/tests/throughput-tests.json .buildkite/scripts/annotate-release.sh .buildkite/scripts/ci-clean-log.sh .buildkite/scripts/rerun-test.sh .buildkite/scripts/run-benchmarks.sh .buildkite/scripts/run-multi-node-test.sh .buildkite/scripts/upload-wheels.sh .buildkite/scripts/hardware_ci/run-amd-test.sh .buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh .buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh .buildkite/scripts/hardware_ci/run-cpu-test.sh .buildkite/scripts/hardware_ci/run-gh200-test.sh .buildkite/scripts/hardware_ci/run-hpu-test.sh .buildkite/scripts/hardware_ci/run-neuron-test.sh .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh .buildkite/scripts/hardware_ci/run-xpu-test.sh .buildkite/scripts/tpu/cleanup_docker.sh .buildkite/scripts/tpu/config_v6e_1.env .buildkite/scripts/tpu/docker_run_bm.sh .buildkite/scripts/tpu/quantized_v6e_1.env .buildkite/scripts/tpu/run_bm.sh .gemini/config.yaml .github/CODEOWNERS .github/FUNDING.yml .github/PULL_REQUEST_TEMPLATE.md .github/dependabot.yml .github/mergify.yml .github/ISSUE_TEMPLATE/100-documentation.yml .github/ISSUE_TEMPLATE/200-installation.yml .github/ISSUE_TEMPLATE/300-usage.yml .github/ISSUE_TEMPLATE/400-bug-report.yml .github/ISSUE_TEMPLATE/450-ci-failure.yml .github/ISSUE_TEMPLATE/500-feature-request.yml .github/ISSUE_TEMPLATE/600-new-model.yml .github/ISSUE_TEMPLATE/700-performance-discussion.yml .github/ISSUE_TEMPLATE/750-RFC.yml .github/ISSUE_TEMPLATE/config.yml .github/scripts/cleanup_pr_body.sh .github/workflows/add_label_automerge.yml .github/workflows/cleanup_pr_body.yml .github/workflows/lint-and-deploy.yaml .github/workflows/pre-commit.yml .github/workflows/publish.yml .github/workflows/reminder_comment.yml .github/workflows/stale.yml .github/workflows/matchers/actionlint.json .github/workflows/matchers/mypy.json .github/workflows/scripts/build.sh .github/workflows/scripts/create_release.js .github/workflows/scripts/cuda-install.sh .github/workflows/scripts/env.sh .github/workflows/scripts/pytorch-install.sh benchmarks/README.md benchmarks/backend_request_func.py benchmarks/benchmark_dataset.py benchmarks/benchmark_latency.py benchmarks/benchmark_long_document_qa_throughput.py benchmarks/benchmark_prefix_caching.py benchmarks/benchmark_prioritization.py benchmarks/benchmark_serving.py benchmarks/benchmark_serving_structured_output.py benchmarks/benchmark_throughput.py benchmarks/benchmark_utils.py benchmarks/pyproject.toml benchmarks/run_structured_output_benchmark.sh benchmarks/sonnet.txt benchmarks/auto_tune/README.md benchmarks/auto_tune/auto_tune.sh benchmarks/cutlass_benchmarks/sparse_benchmarks.py benchmarks/cutlass_benchmarks/utils.py benchmarks/cutlass_benchmarks/w8a8_benchmarks.py benchmarks/cutlass_benchmarks/weight_shapes.py benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py benchmarks/disagg_benchmarks/round_robin_proxy.py benchmarks/disagg_benchmarks/visualize_benchmark_results.py benchmarks/fused_kernels/layernorm_rms_benchmarks.py benchmarks/kernels/bench_fp8_gemm.py benchmarks/kernels/bench_int8_gemm.py benchmarks/kernels/bench_nvfp4_gemm.py benchmarks/kernels/bench_per_token_quant_fp8.py benchmarks/kernels/benchmark_aqlm.py benchmarks/kernels/benchmark_bitblas.py benchmarks/kernels/benchmark_cutlass_fp4_moe.py benchmarks/kernels/benchmark_grouped_gemm_cutlass.py benchmarks/kernels/benchmark_layernorm.py benchmarks/kernels/benchmark_lora.py benchmarks/kernels/benchmark_machete.py benchmarks/kernels/benchmark_marlin.py benchmarks/kernels/benchmark_moe.py benchmarks/kernels/benchmark_moe_align_block_size.py benchmarks/kernels/benchmark_moe_permute_unpermute.py benchmarks/kernels/benchmark_paged_attention.py benchmarks/kernels/benchmark_quant.py benchmarks/kernels/benchmark_rmsnorm.py benchmarks/kernels/benchmark_rope.py benchmarks/kernels/benchmark_shapes.py benchmarks/kernels/benchmark_trtllm_attention.py benchmarks/kernels/benchmark_w8a8_block_fp8.py benchmarks/kernels/graph_machete_bench.py benchmarks/kernels/requirements.txt benchmarks/kernels/utils.py benchmarks/kernels/weight_shapes.py benchmarks/kernels/deepgemm/README.md benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py benchmarks/kv_cache/benchmark_block_pool.py benchmarks/overheads/benchmark_hashing.py benchmarks/structured_schemas/structured_schema_1.json cmake/cpu_extension.cmake cmake/hipify.py cmake/utils.cmake cmake/external_projects/flashmla.cmake cmake/external_projects/vllm_flash_attn.cmake csrc/activation_kernels.cu csrc/cache.h csrc/cache_kernels.cu csrc/cuda_compat.h csrc/cuda_utils.h csrc/cuda_utils_kernels.cu csrc/cuda_view.cu csrc/cumem_allocator.cpp csrc/custom_all_reduce.cu csrc/custom_all_reduce.cuh csrc/custom_all_reduce_test.cu csrc/custom_quickreduce.cu csrc/dispatch_utils.h csrc/layernorm_kernels.cu csrc/layernorm_quant_kernels.cu csrc/ops.h csrc/permute_cols.cu csrc/pos_encoding_kernels.cu csrc/sampler.cu csrc/torch_bindings.cpp csrc/type_convert.cuh csrc/attention/attention_dtypes.h csrc/attention/attention_generic.cuh csrc/attention/attention_kernels.cuh csrc/attention/attention_utils.cuh csrc/attention/dtype_bfloat16.cuh csrc/attention/dtype_float16.cuh csrc/attention/dtype_float32.cuh csrc/attention/dtype_fp8.cuh csrc/attention/merge_attn_states.cu csrc/attention/paged_attention_v1.cu csrc/attention/paged_attention_v2.cu csrc/attention/vertical_slash_index.cu csrc/attention/mla/cutlass_mla_entry.cu csrc/attention/mla/cutlass_mla_kernels.cu csrc/attention/mla/sm100_cutlass_mla_kernel.cu csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp csrc/core/exception.hpp csrc/core/math.hpp csrc/core/registration.h csrc/core/scalar_type.hpp csrc/cpu/activation.cpp csrc/cpu/attention.cpp csrc/cpu/cache.cpp csrc/cpu/cpu_types.hpp csrc/cpu/cpu_types_arm.hpp csrc/cpu/cpu_types_vsx.hpp csrc/cpu/cpu_types_vxe.hpp csrc/cpu/cpu_types_x86.hpp csrc/cpu/dnnl_helper.hpp csrc/cpu/layernorm.cpp csrc/cpu/mla_decode.cpp csrc/cpu/pos_encoding.cpp csrc/cpu/quant.cpp csrc/cpu/shm.cpp csrc/cpu/torch_bindings.cpp csrc/cpu/utils.cpp csrc/cpu/sgl-kernels/common.h csrc/cpu/sgl-kernels/gemm.cpp csrc/cpu/sgl-kernels/gemm.h csrc/cpu/sgl-kernels/gemm_fp8.cpp csrc/cpu/sgl-kernels/gemm_int8.cpp csrc/cpu/sgl-kernels/moe.cpp csrc/cpu/sgl-kernels/moe_fp8.cpp csrc/cpu/sgl-kernels/moe_int8.cpp csrc/cpu/sgl-kernels/vec.h csrc/cutlass_extensions/common.cpp csrc/cutlass_extensions/common.hpp csrc/cutlass_extensions/cute_utils.cuh csrc/cutlass_extensions/torch_utils.hpp csrc/cutlass_extensions/vllm_collective_builder.cuh csrc/cutlass_extensions/vllm_custom_types.cuh csrc/cutlass_extensions/vllm_cutlass_library_extension.py csrc/cutlass_extensions/vllm_numeric_conversion.cuh csrc/cutlass_extensions/vllm_type_utils.cuh csrc/cutlass_extensions/__pycache__/vllm_cutlass_library_extension.cpython-312.pyc csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp csrc/cutlass_extensions/gemm/dispatch_policy.hpp csrc/cutlass_extensions/gemm/collective/collective_builder.hpp csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp csrc/mamba/mamba_ssm/selective_scan.h csrc/mamba/mamba_ssm/selective_scan_fwd.cu csrc/mamba/mamba_ssm/static_switch.h csrc/moe/moe_align_sum_kernels.cu csrc/moe/moe_ops.h csrc/moe/moe_permute_unpermute_op.cu csrc/moe/moe_wna16.cu csrc/moe/moe_wna16_utils.h csrc/moe/topk_softmax_kernels.cu csrc/moe/torch_bindings.cpp csrc/moe/marlin_moe_wna16/.gitignore csrc/moe/marlin_moe_wna16/generate_kernels.py csrc/moe/marlin_moe_wna16/kernel.h csrc/moe/marlin_moe_wna16/kernel_bf16_kfe2m1f.cu csrc/moe/marlin_moe_wna16/kernel_bf16_kfe4m3fn.cu csrc/moe/marlin_moe_wna16/kernel_bf16_ku4.cu csrc/moe/marlin_moe_wna16/kernel_bf16_ku4b8.cu csrc/moe/marlin_moe_wna16/kernel_bf16_ku8b128.cu csrc/moe/marlin_moe_wna16/kernel_fp16_kfe2m1f.cu csrc/moe/marlin_moe_wna16/kernel_fp16_kfe4m3fn.cu csrc/moe/marlin_moe_wna16/kernel_fp16_ku4.cu csrc/moe/marlin_moe_wna16/kernel_fp16_ku4b8.cu csrc/moe/marlin_moe_wna16/kernel_fp16_ku8b128.cu csrc/moe/marlin_moe_wna16/marlin_template.h csrc/moe/marlin_moe_wna16/ops.cu csrc/moe/permute_unpermute_kernels/dispatch.h csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl csrc/prepare_inputs/advance_step.cu csrc/prepare_inputs/advance_step.cuh csrc/quantization/activation_kernels.cu csrc/quantization/utils.cuh csrc/quantization/vectorization.cuh csrc/quantization/vectorization_utils.cuh csrc/quantization/aqlm/gemm_kernels.cu csrc/quantization/awq/dequantize.cuh csrc/quantization/awq/gemm_kernels.cu csrc/quantization/compressed_tensors/int8_quant_kernels.cu csrc/quantization/cutlass_w8a8/Epilogues.md csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8_dispatch.cuh csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu csrc/quantization/cutlass_w8a8/moe/moe_data.cu csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu csrc/quantization/fp4/nvfp4_experts_quant.cu csrc/quantization/fp4/nvfp4_quant_entry.cu csrc/quantization/fp4/nvfp4_quant_kernels.cu csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu csrc/quantization/fp8/common.cu csrc/quantization/fp8/common.cuh csrc/quantization/fp8/per_token_group_quant.cu csrc/quantization/fp8/amd/quant_utils.cuh csrc/quantization/fp8/nvidia/quant_utils.cuh csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu csrc/quantization/fused_kernels/layernorm_utils.cuh csrc/quantization/fused_kernels/quant_conversions.cuh csrc/quantization/gguf/dequantize.cuh csrc/quantization/gguf/ggml-common.h csrc/quantization/gguf/gguf_kernel.cu csrc/quantization/gguf/mmq.cuh csrc/quantization/gguf/mmvq.cuh csrc/quantization/gguf/moe.cuh csrc/quantization/gguf/moe_vec.cuh csrc/quantization/gguf/vecdotq.cuh csrc/quantization/gptq/compat.cuh csrc/quantization/gptq/matrix_view.cuh csrc/quantization/gptq/q_gemm.cu csrc/quantization/gptq/qdq_2.cuh csrc/quantization/gptq/qdq_3.cuh csrc/quantization/gptq/qdq_4.cuh csrc/quantization/gptq/qdq_8.cuh csrc/quantization/gptq/qdq_util.cuh csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu csrc/quantization/gptq_allspark/allspark_repack.cu csrc/quantization/gptq_allspark/allspark_utils.cuh csrc/quantization/gptq_marlin/.gitignore csrc/quantization/gptq_marlin/awq_marlin_repack.cu csrc/quantization/gptq_marlin/dequant.h csrc/quantization/gptq_marlin/generate_kernels.py csrc/quantization/gptq_marlin/gptq_marlin.cu csrc/quantization/gptq_marlin/gptq_marlin_repack.cu csrc/quantization/gptq_marlin/kernel.h csrc/quantization/gptq_marlin/kernel_bf16_kfe2m1f.cu csrc/quantization/gptq_marlin/kernel_bf16_kfe4m3fn.cu csrc/quantization/gptq_marlin/kernel_bf16_ku4.cu csrc/quantization/gptq_marlin/kernel_bf16_ku4b8.cu csrc/quantization/gptq_marlin/kernel_bf16_ku8b128.cu csrc/quantization/gptq_marlin/kernel_fp16_kfe2m1f.cu csrc/quantization/gptq_marlin/kernel_fp16_kfe4m3fn.cu csrc/quantization/gptq_marlin/kernel_fp16_ku4.cu csrc/quantization/gptq_marlin/kernel_fp16_ku4b8.cu csrc/quantization/gptq_marlin/kernel_fp16_ku8b128.cu csrc/quantization/gptq_marlin/marlin.cuh csrc/quantization/gptq_marlin/marlin_dtypes.cuh csrc/quantization/gptq_marlin/marlin_template.h csrc/quantization/machete/Readme.md csrc/quantization/machete/generate.py csrc/quantization/machete/machete_collective_builder.cuh csrc/quantization/machete/machete_interleaving_utils.cuh csrc/quantization/machete/machete_mainloop.cuh csrc/quantization/machete/machete_mm_kernel.cuh csrc/quantization/machete/machete_mm_launcher.cuh csrc/quantization/machete/machete_prepack_kernel.cuh csrc/quantization/machete/machete_prepack_launcher.cuh csrc/quantization/machete/machete_prepacked_layout.cuh csrc/quantization/machete/machete_pytorch.cu csrc/quantization/machete/generated/machete_mm_dispatch.cu csrc/quantization/machete/generated/machete_mm_impl_part1.cu csrc/quantization/machete/generated/machete_mm_impl_part2.cu csrc/quantization/machete/generated/machete_mm_impl_part3.cu csrc/quantization/machete/generated/machete_mm_impl_part4.cu csrc/quantization/machete/generated/machete_mm_impl_part5.cu csrc/quantization/machete/generated/machete_mm_impl_part6.cu csrc/quantization/machete/generated/machete_mm_impl_part7.cu csrc/quantization/machete/generated/machete_mm_impl_part8.cu csrc/quantization/machete/generated/machete_prepack.cu csrc/quantization/marlin/dense/LICENSE csrc/quantization/marlin/dense/marlin_cuda_kernel.cu csrc/quantization/marlin/dense/common/base.h csrc/quantization/marlin/dense/common/mem.h csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu csrc/quantization/marlin/sparse/LICENSE csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu csrc/quantization/marlin/sparse/common/base.h csrc/quantization/marlin/sparse/common/mem.h csrc/quantization/marlin/sparse/common/mma.h csrc/quickreduce/base.h csrc/quickreduce/quick_reduce.h csrc/quickreduce/quick_reduce_impl.cuh csrc/rocm/attention.cu csrc/rocm/ops.h csrc/rocm/skinny_gemms.cu csrc/rocm/torch_bindings.cpp csrc/sparse/cutlass/sparse_compressor_c3x.cuh csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh csrc/sparse/cutlass/sparse_scaled_mm_entry.cu docker/Dockerfile docker/Dockerfile.arm docker/Dockerfile.cpu docker/Dockerfile.neuron docker/Dockerfile.nightly_torch docker/Dockerfile.ppc64le docker/Dockerfile.rocm docker/Dockerfile.rocm_base docker/Dockerfile.s390x docker/Dockerfile.tpu docker/Dockerfile.xpu docs/.nav.yml docs/README.md docs/api/README.md docs/api/vllm/.meta.yml docs/assets/contributing/dockerfile-stages-dependency.png docs/assets/deployment/anything-llm-chat-with-doc.png docs/assets/deployment/anything-llm-chat-without-doc.png docs/assets/deployment/anything-llm-provider.png docs/assets/deployment/anything-llm-upload-doc.png docs/assets/deployment/architecture_helm_deployment.png docs/assets/deployment/chatbox-chat.png docs/assets/deployment/chatbox-settings.png docs/assets/deployment/dify-chat.png docs/assets/deployment/dify-create-chatbot.png docs/assets/deployment/dify-settings.png docs/assets/deployment/dp_external_lb.png docs/assets/deployment/dp_internal_lb.png docs/assets/deployment/open_webui.png docs/assets/deployment/streamlit-chat.png docs/assets/design/hierarchy.png docs/assets/design/arch_overview/entrypoints.excalidraw.png docs/assets/design/arch_overview/llm_engine.excalidraw.png docs/assets/design/v1/metrics/intervals-1.png docs/assets/design/v1/metrics/intervals-2.png docs/assets/design/v1/metrics/intervals-3.png docs/assets/design/v1/prefix_caching/example-time-1.png docs/assets/design/v1/prefix_caching/example-time-3.png docs/assets/design/v1/prefix_caching/example-time-4.png docs/assets/design/v1/prefix_caching/example-time-5.png docs/assets/design/v1/prefix_caching/example-time-6.png docs/assets/design/v1/prefix_caching/example-time-7.png docs/assets/design/v1/prefix_caching/free.png docs/assets/design/v1/prefix_caching/overview.png docs/assets/features/disagg_prefill/abstraction.jpg docs/assets/features/disagg_prefill/overview.jpg docs/assets/kernel/k_vecs.png docs/assets/kernel/key.png docs/assets/kernel/logits_vec.png docs/assets/kernel/q_vecs.png docs/assets/kernel/query.png docs/assets/kernel/v_vec.png docs/assets/kernel/value.png docs/assets/logos/vllm-logo-only-light.ico docs/assets/logos/vllm-logo-only-light.png docs/assets/logos/vllm-logo-text-dark.png docs/assets/logos/vllm-logo-text-light.png docs/cli/README.md docs/community/contact_us.md docs/community/meetups.md docs/community/sponsors.md docs/configuration/README.md docs/configuration/conserving_memory.md docs/configuration/engine_args.md docs/configuration/env_vars.md docs/configuration/model_resolution.md docs/configuration/optimization.md docs/configuration/serve_args.md docs/contributing/README.md docs/contributing/benchmarks.md docs/contributing/deprecation_policy.md docs/contributing/incremental_build.md docs/contributing/profiling.md docs/contributing/vulnerability_management.md docs/contributing/ci/failures.md docs/contributing/ci/update_pytorch_version.md docs/contributing/dockerfile/dockerfile.md docs/contributing/model/README.md docs/contributing/model/basic.md docs/contributing/model/multimodal.md docs/contributing/model/registration.md docs/contributing/model/tests.md docs/deployment/docker.md docs/deployment/k8s.md docs/deployment/nginx.md docs/deployment/frameworks/anyscale.md docs/deployment/frameworks/anything-llm.md docs/deployment/frameworks/autogen.md docs/deployment/frameworks/bentoml.md docs/deployment/frameworks/cerebrium.md docs/deployment/frameworks/chatbox.md docs/deployment/frameworks/dify.md docs/deployment/frameworks/dstack.md docs/deployment/frameworks/haystack.md docs/deployment/frameworks/helm.md docs/deployment/frameworks/litellm.md docs/deployment/frameworks/lobe-chat.md docs/deployment/frameworks/lws.md docs/deployment/frameworks/modal.md docs/deployment/frameworks/open-webui.md docs/deployment/frameworks/retrieval_augmented_generation.md docs/deployment/frameworks/skypilot.md docs/deployment/frameworks/streamlit.md docs/deployment/frameworks/triton.md docs/deployment/integrations/kserve.md docs/deployment/integrations/kubeai.md docs/deployment/integrations/kuberay.md docs/deployment/integrations/llamastack.md docs/deployment/integrations/llmaz.md docs/deployment/integrations/production-stack.md docs/design/arch_overview.md docs/design/automatic_prefix_caching.md docs/design/huggingface_integration.md docs/design/mm_processing.md docs/design/plugin_system.md docs/design/kernel/paged_attention.md docs/design/v1/metrics.md docs/design/v1/multiprocessing.md docs/design/v1/p2p_nccl_connector.md docs/design/v1/prefix_caching.md docs/design/v1/torch_compile.md docs/features/automatic_prefix_caching.md docs/features/compatibility_matrix.md docs/features/disagg_prefill.md docs/features/lora.md docs/features/multimodal_inputs.md docs/features/prompt_embeds.md docs/features/reasoning_outputs.md docs/features/spec_decode.md docs/features/structured_outputs.md docs/features/tool_calling.md docs/features/quantization/README.md docs/features/quantization/auto_awq.md docs/features/quantization/bitblas.md docs/features/quantization/bnb.md docs/features/quantization/fp8.md docs/features/quantization/gguf.md docs/features/quantization/gptqmodel.md docs/features/quantization/inc.md docs/features/quantization/int4.md docs/features/quantization/int8.md docs/features/quantization/modelopt.md docs/features/quantization/quantized_kvcache.md docs/features/quantization/quark.md docs/features/quantization/supported_hardware.md docs/features/quantization/torchao.md docs/getting_started/quickstart.md docs/getting_started/installation/.nav.yml docs/getting_started/installation/README.md docs/getting_started/installation/aws_neuron.md docs/getting_started/installation/cpu.md docs/getting_started/installation/device.template.md docs/getting_started/installation/google_tpu.md docs/getting_started/installation/gpu.md docs/getting_started/installation/intel_gaudi.md docs/getting_started/installation/python_env_setup.inc.md docs/getting_started/installation/cpu/apple.inc.md docs/getting_started/installation/cpu/arm.inc.md docs/getting_started/installation/cpu/build.inc.md docs/getting_started/installation/cpu/s390x.inc.md docs/getting_started/installation/cpu/x86.inc.md docs/getting_started/installation/gpu/cuda.inc.md docs/getting_started/installation/gpu/rocm.inc.md docs/getting_started/installation/gpu/xpu.inc.md docs/mkdocs/hooks/generate_argparse.py docs/mkdocs/hooks/generate_examples.py docs/mkdocs/hooks/remove_announcement.py docs/mkdocs/hooks/url_schemes.py docs/mkdocs/javascript/edit_and_feedback.js docs/mkdocs/javascript/run_llm_widget.js docs/mkdocs/javascript/slack_and_forum.js docs/mkdocs/overrides/main.html docs/mkdocs/overrides/partials/toc-item.html docs/mkdocs/stylesheets/extra.css docs/models/generative_models.md docs/models/pooling_models.md docs/models/supported_models.md docs/models/extensions/fastsafetensor.md docs/models/extensions/runai_model_streamer.md docs/models/extensions/tensorizer.md docs/models/hardware_supported_models/tpu.md docs/serving/data_parallel_deployment.md docs/serving/distributed_serving.md docs/serving/offline_inference.md docs/serving/openai_compatible_server.md docs/serving/integrations/langchain.md docs/serving/integrations/llamaindex.md docs/training/rlhf.md docs/training/trl.md docs/usage/README.md docs/usage/faq.md docs/usage/metrics.md docs/usage/reproducibility.md docs/usage/security.md docs/usage/troubleshooting.md docs/usage/usage_stats.md docs/usage/v1_guide.md examples/pyproject.toml examples/template_alpaca.jinja examples/template_baichuan.jinja examples/template_chatglm.jinja examples/template_chatglm2.jinja examples/template_chatml.jinja examples/template_dse_qwen2_vl.jinja examples/template_falcon.jinja examples/template_falcon_180b.jinja examples/template_inkbot.jinja examples/template_teleflm.jinja examples/template_vlm2vec.jinja examples/tool_chat_template_deepseekr1.jinja examples/tool_chat_template_deepseekv3.jinja examples/tool_chat_template_granite.jinja examples/tool_chat_template_granite_20b_fc.jinja examples/tool_chat_template_hermes.jinja examples/tool_chat_template_hunyuan_a13b.jinja examples/tool_chat_template_internlm2_tool.jinja examples/tool_chat_template_llama3.1_json.jinja examples/tool_chat_template_llama3.2_json.jinja examples/tool_chat_template_llama3.2_pythonic.jinja examples/tool_chat_template_llama4_json.jinja examples/tool_chat_template_llama4_pythonic.jinja examples/tool_chat_template_minimax_m1.jinja examples/tool_chat_template_mistral.jinja examples/tool_chat_template_mistral3.jinja examples/tool_chat_template_mistral_parallel.jinja examples/tool_chat_template_phi4_mini.jinja examples/tool_chat_template_toolace.jinja examples/tool_chat_template_xlam_llama.jinja examples/tool_chat_template_xlam_qwen.jinja examples/offline_inference/audio_language.py examples/offline_inference/automatic_prefix_caching.py examples/offline_inference/batch_llm_inference.py examples/offline_inference/chat_with_tools.py examples/offline_inference/context_extension.py examples/offline_inference/convert_model_to_seq_cls.py examples/offline_inference/data_parallel.py examples/offline_inference/disaggregated_prefill.py examples/offline_inference/embed_jina_embeddings_v3.py examples/offline_inference/embed_matryoshka_fy.py examples/offline_inference/encoder_decoder.py examples/offline_inference/encoder_decoder_multimodal.py examples/offline_inference/llm_engine_example.py examples/offline_inference/load_sharded_state.py examples/offline_inference/lora_with_quantization_inference.py examples/offline_inference/metrics.py examples/offline_inference/mistral-small.py examples/offline_inference/mlpspeculator.py examples/offline_inference/multilora_inference.py examples/offline_inference/neuron.py examples/offline_inference/neuron_eagle.py examples/offline_inference/neuron_int8_quantization.py examples/offline_inference/neuron_multimodal.py examples/offline_inference/neuron_speculation.py examples/offline_inference/prefix_caching.py examples/offline_inference/prithvi_geospatial_mae.py examples/offline_inference/profiling.py examples/offline_inference/prompt_embed_inference.py examples/offline_inference/qwen3_reranker.py examples/offline_inference/qwen_1m.py examples/offline_inference/reproducibility.py examples/offline_inference/rlhf.py examples/offline_inference/rlhf_colocate.py examples/offline_inference/rlhf_utils.py examples/offline_inference/save_sharded_state.py examples/offline_inference/simple_profiling.py examples/offline_inference/skip_loading_weights_in_engine_init.py examples/offline_inference/spec_decode.py examples/offline_inference/structured_outputs.py examples/offline_inference/torchrun_example.py examples/offline_inference/tpu.py examples/offline_inference/vision_language.py examples/offline_inference/vision_language_multi_image.py examples/offline_inference/vision_language_pooling.py examples/offline_inference/basic/README.md examples/offline_inference/basic/basic.py examples/offline_inference/basic/chat.py examples/offline_inference/basic/classify.py examples/offline_inference/basic/embed.py examples/offline_inference/basic/generate.py examples/offline_inference/basic/score.py examples/offline_inference/disaggregated-prefill-v1/README.md examples/offline_inference/disaggregated-prefill-v1/decode_example.py examples/offline_inference/disaggregated-prefill-v1/prefill_example.py examples/offline_inference/disaggregated-prefill-v1/run.sh examples/offline_inference/openai_batch/README.md examples/offline_inference/openai_batch/openai_example_batch.jsonl examples/offline_inference/profiling_tpu/README.md examples/offline_inference/profiling_tpu/profiling.py examples/offline_inference/qwen2_5_omni/README.md examples/offline_inference/qwen2_5_omni/only_thinker.py examples/online_serving/api_client.py examples/online_serving/cohere_rerank_client.py examples/online_serving/disaggregated_prefill.sh examples/online_serving/gradio_openai_chatbot_webserver.py examples/online_serving/gradio_webserver.py examples/online_serving/jinaai_rerank_client.py examples/online_serving/kv_events_subscriber.py examples/online_serving/multi-node-serving.sh examples/online_serving/multi_instance_data_parallel.py examples/online_serving/openai_chat_completion_client.py examples/online_serving/openai_chat_completion_client_for_multimodal.py examples/online_serving/openai_chat_completion_client_with_tools.py examples/online_serving/openai_chat_completion_client_with_tools_required.py examples/online_serving/openai_chat_completion_client_with_tools_xlam.py examples/online_serving/openai_chat_completion_client_with_tools_xlam_streaming.py examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py examples/online_serving/openai_chat_completion_with_reasoning.py examples/online_serving/openai_chat_completion_with_reasoning_streaming.py examples/online_serving/openai_chat_embedding_client_for_multimodal.py examples/online_serving/openai_classification_client.py examples/online_serving/openai_completion_client.py examples/online_serving/openai_cross_encoder_score.py examples/online_serving/openai_cross_encoder_score_for_multimodal.py examples/online_serving/openai_embedding_client.py examples/online_serving/openai_embedding_matryoshka_fy.py examples/online_serving/openai_pooling_client.py examples/online_serving/openai_transcription_client.py examples/online_serving/openai_translation_client.py examples/online_serving/prompt_embed_inference_with_openai_client.py examples/online_serving/ray_serve_deepseek.py examples/online_serving/retrieval_augmented_generation_with_langchain.py examples/online_serving/retrieval_augmented_generation_with_llamaindex.py examples/online_serving/run_cluster.sh examples/online_serving/sagemaker-entrypoint.sh examples/online_serving/streamlit_openai_chatbot_webserver.py examples/online_serving/utils.py examples/online_serving/chart-helm/.helmignore examples/online_serving/chart-helm/Chart.yaml examples/online_serving/chart-helm/README.md examples/online_serving/chart-helm/ct.yaml examples/online_serving/chart-helm/lintconf.yaml examples/online_serving/chart-helm/values.schema.json examples/online_serving/chart-helm/values.yaml examples/online_serving/chart-helm/templates/_helpers.tpl examples/online_serving/chart-helm/templates/configmap.yaml examples/online_serving/chart-helm/templates/custom-objects.yaml examples/online_serving/chart-helm/templates/deployment.yaml examples/online_serving/chart-helm/templates/hpa.yaml examples/online_serving/chart-helm/templates/job.yaml examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml examples/online_serving/chart-helm/templates/pvc.yaml examples/online_serving/chart-helm/templates/secrets.yaml examples/online_serving/chart-helm/templates/service.yaml examples/online_serving/disaggregated_serving/README.md examples/online_serving/disaggregated_serving/disagg_proxy_demo.py examples/online_serving/disaggregated_serving/kv_events.sh examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py examples/online_serving/elastic_ep/bench.sh examples/online_serving/elastic_ep/scale.py examples/online_serving/elastic_ep/serve_deepseek_v2.sh examples/online_serving/opentelemetry/README.md examples/online_serving/opentelemetry/dummy_client.py examples/online_serving/prometheus_grafana/README.md examples/online_serving/prometheus_grafana/docker-compose.yaml examples/online_serving/prometheus_grafana/grafana.json examples/online_serving/prometheus_grafana/prometheus.yaml examples/online_serving/structured_outputs/README.md examples/online_serving/structured_outputs/pyproject.toml examples/online_serving/structured_outputs/structured_outputs.py examples/others/logging_configuration.md examples/others/tensorize_vllm_model.py examples/others/lmcache/README.md examples/others/lmcache/cpu_offload_lmcache.py examples/others/lmcache/disagg_prefill_lmcache_v0.py examples/others/lmcache/kv_cache_sharing_lmcache_v1.py examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml requirements/build.txt requirements/common.txt requirements/cpu-build.txt requirements/cpu.txt requirements/cuda.txt requirements/dev.txt requirements/docs.txt requirements/kv_connectors.txt requirements/lint.txt requirements/neuron.txt requirements/nightly_torch_test.txt requirements/rocm-build.txt requirements/rocm-test.txt requirements/rocm.txt requirements/test.in requirements/test.txt requirements/tpu.txt requirements/xpu.txt tests/__init__.py tests/build_cython.py tests/conftest.py tests/test_cache_block_hashing.py tests/test_config.py tests/test_embedded_commit.py tests/test_inputs.py tests/test_logger.py tests/test_outputs.py tests/test_regression.py tests/test_sampling_params.py tests/test_scalartype.py tests/test_seed_behavior.py tests/test_sequence.py tests/test_sharded_state_loader.py tests/test_triton_utils.py tests/test_utils.py tests/test_version.py tests/test_vllm_port.py tests/utils.py tests/async_engine/__init__.py tests/async_engine/api_server_async_engine.py tests/async_engine/conftest.py tests/async_engine/test_api_server.py tests/async_engine/test_async_llm_engine.py tests/async_engine/test_request_tracker.py tests/basic_correctness/__init__.py tests/basic_correctness/test_basic_correctness.py tests/basic_correctness/test_chunked_prefill.py tests/basic_correctness/test_cpu_offload.py tests/basic_correctness/test_cumem.py tests/basic_correctness/test_preemption.py tests/benchmarks/__init__.py tests/benchmarks/test_latency_cli.py tests/benchmarks/test_serve_cli.py tests/benchmarks/test_throughput_cli.py tests/compile/__init__.py tests/compile/backend.py tests/compile/test_async_tp.py tests/compile/test_basic_correctness.py tests/compile/test_config.py tests/compile/test_full_graph.py tests/compile/test_functionalization.py tests/compile/test_fusion.py tests/compile/test_fusion_all_reduce.py tests/compile/test_fusion_attn.py tests/compile/test_pass_manager.py tests/compile/test_sequence_parallelism.py tests/compile/test_silu_mul_quant_fusion.py tests/compile/test_wrapper.py tests/compile/piecewise/__init__.py tests/compile/piecewise/test_full_cudagraph.py tests/compile/piecewise/test_multiple_graphs.py tests/compile/piecewise/test_simple.py tests/compile/piecewise/test_toy_llama.py tests/config/test_config.yaml tests/config/test_config_generation.py tests/config/test_config_with_model.yaml tests/config/test_mp_reducer.py tests/core/__init__.py tests/core/conftest.py tests/core/test_chunked_prefill_scheduler.py tests/core/test_num_computed_tokens_update.py tests/core/test_scheduler.py tests/core/test_scheduler_encoder_decoder.py tests/core/test_serialization.py tests/core/utils.py tests/core/block/__init__.py tests/core/block/conftest.py tests/core/block/test_block_manager.py tests/core/block/test_block_table.py tests/core/block/test_common.py tests/core/block/test_cpu_gpu_block_allocator.py tests/core/block/test_naive_block.py tests/core/block/test_prefix_caching_block.py tests/core/block/e2e/__init__.py tests/core/block/e2e/conftest.py tests/core/block/e2e/test_correctness.py tests/core/block/e2e/test_correctness_sliding_window.py tests/cuda/test_cuda_context.py tests/detokenizer/__init__.py tests/detokenizer/conftest.py tests/detokenizer/test_disable_detokenization.py tests/detokenizer/test_stop_checker.py tests/detokenizer/test_stop_reason.py tests/detokenizer/test_stop_strings.py tests/distributed/__init__.py tests/distributed/conftest.py tests/distributed/test_ca_buffer_sharing.py tests/distributed/test_comm_ops.py tests/distributed/test_custom_all_reduce.py tests/distributed/test_distributed_oot.py tests/distributed/test_eplb_algo.py tests/distributed/test_eplb_execute.py tests/distributed/test_events.py tests/distributed/test_expert_parallel.py tests/distributed/test_multi_node_assignment.py tests/distributed/test_node_count.py tests/distributed/test_pipeline_parallel.py tests/distributed/test_pipeline_partition.py tests/distributed/test_pp_cudagraph.py tests/distributed/test_pynccl.py tests/distributed/test_quick_all_reduce.py tests/distributed/test_same_node.py tests/distributed/test_sequence_parallel.py tests/distributed/test_shm_broadcast.py tests/distributed/test_torchrun_example.py tests/distributed/test_utils.py tests/encoder_decoder/__init__.py tests/encoder_decoder/test_e2e_correctness.py tests/engine/__init__.py tests/engine/conftest.py tests/engine/test_arg_utils.py tests/engine/test_computed_prefix_blocks.py tests/engine/test_executor.py tests/engine/test_multi_step_output_processor.py tests/engine/test_multiproc_workers.py tests/engine/test_options.py tests/engine/test_short_mm_context.py tests/entrypoints/__init__.py tests/entrypoints/conftest.py tests/entrypoints/test_api_server_process_manager.py tests/entrypoints/test_chat_utils.py tests/entrypoints/test_ssl_cert_refresher.py tests/entrypoints/llm/__init__.py tests/entrypoints/llm/test_accuracy.py tests/entrypoints/llm/test_chat.py tests/entrypoints/llm/test_collective_rpc.py tests/entrypoints/llm/test_encode.py tests/entrypoints/llm/test_generate.py tests/entrypoints/llm/test_generate_multiple_loras.py tests/entrypoints/llm/test_gpu_utilization.py tests/entrypoints/llm/test_guided_generate.py tests/entrypoints/llm/test_lazy_outlines.py tests/entrypoints/llm/test_prompt_validation.py tests/entrypoints/offline_mode/__init__.py tests/entrypoints/offline_mode/test_offline_mode.py tests/entrypoints/openai/__init__.py tests/entrypoints/openai/test_async_tokenization.py tests/entrypoints/openai/test_audio.py tests/entrypoints/openai/test_basic.py tests/entrypoints/openai/test_chat.py tests/entrypoints/openai/test_chat_echo.py tests/entrypoints/openai/test_chat_logit_bias_validation.py tests/entrypoints/openai/test_chat_template.py tests/entrypoints/openai/test_chat_with_tool_reasoning.py tests/entrypoints/openai/test_chunked_prompt.py tests/entrypoints/openai/test_classification.py tests/entrypoints/openai/test_cli_args.py tests/entrypoints/openai/test_completion.py tests/entrypoints/openai/test_completion_with_function_calling.py tests/entrypoints/openai/test_completion_with_prompt_embeds.py tests/entrypoints/openai/test_default_mm_loras.py tests/entrypoints/openai/test_embedding.py tests/entrypoints/openai/test_embedding_dimensions.py tests/entrypoints/openai/test_encoder_decoder.py tests/entrypoints/openai/test_lora_adapters.py tests/entrypoints/openai/test_lora_resolvers.py tests/entrypoints/openai/test_metrics.py tests/entrypoints/openai/test_models.py tests/entrypoints/openai/test_oot_registration.py tests/entrypoints/openai/test_openai_schema.py tests/entrypoints/openai/test_optional_middleware.py tests/entrypoints/openai/test_pooling.py tests/entrypoints/openai/test_prompt_validation.py tests/entrypoints/openai/test_rerank.py tests/entrypoints/openai/test_return_tokens_as_ids.py tests/entrypoints/openai/test_root_path.py tests/entrypoints/openai/test_run_batch.py tests/entrypoints/openai/test_score.py tests/entrypoints/openai/test_serving_chat.py tests/entrypoints/openai/test_serving_models.py tests/entrypoints/openai/test_shutdown.py tests/entrypoints/openai/test_sleep.py tests/entrypoints/openai/test_tensorizer_entrypoint.py tests/entrypoints/openai/test_tokenization.py tests/entrypoints/openai/test_transcription_validation.py tests/entrypoints/openai/test_translation_validation.py tests/entrypoints/openai/test_truncation.py tests/entrypoints/openai/test_video.py tests/entrypoints/openai/test_vision.py tests/entrypoints/openai/test_vision_embedding.py tests/entrypoints/openai/correctness/__init__.py tests/entrypoints/openai/correctness/test_lmeval.py tests/entrypoints/openai/correctness/test_mteb_embed.py tests/entrypoints/openai/correctness/test_mteb_score.py tests/entrypoints/openai/correctness/test_transcription_api_correctness.py tests/entrypoints/openai/tool_parsers/__init__.py tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py tests/entrypoints/openai/tool_parsers/utils.py tests/fastsafetensors_loader/__init__.py tests/fastsafetensors_loader/test_fastsafetensors_loader.py tests/fastsafetensors_loader/test_weight_utils.py tests/kernels/__init__.py tests/kernels/allclose_default.py tests/kernels/quant_utils.py tests/kernels/test_apply_repetition_penalties.py tests/kernels/test_cutlass_mla_decode.py tests/kernels/test_flex_attention.py tests/kernels/test_fused_quant_activation.py tests/kernels/test_triton_flash_attention.py tests/kernels/utils.py tests/kernels/attention/conftest.py tests/kernels/attention/test_attention.py tests/kernels/attention/test_attention_selector.py tests/kernels/attention/test_cache.py tests/kernels/attention/test_cascade_flash_attn.py tests/kernels/attention/test_encoder_decoder_attn.py tests/kernels/attention/test_flash_attn.py tests/kernels/attention/test_flashinfer.py tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py tests/kernels/attention/test_flashmla.py tests/kernels/attention/test_lightning_attn.py tests/kernels/attention/test_merge_attn_states.py tests/kernels/attention/test_mha_attn.py tests/kernels/attention/test_mla_decode_cpu.py tests/kernels/attention/test_prefix_prefill.py tests/kernels/attention/test_rocm_attention_selector.py tests/kernels/attention/test_triton_decode_attention.py tests/kernels/attention/test_triton_unified_attention.py tests/kernels/core/test_activation.py tests/kernels/core/test_fused_quant_layernorm.py tests/kernels/core/test_layernorm.py tests/kernels/core/test_opcheck.py tests/kernels/core/test_permute_cols.py tests/kernels/core/test_pos_encoding.py tests/kernels/core/test_rotary_embedding.py tests/kernels/core/test_uva.py tests/kernels/mamba/test_causal_conv1d.py tests/kernels/mamba/test_mamba_mixer2.py tests/kernels/mamba/test_mamba_ssm.py tests/kernels/mamba/test_mamba_ssm_ssd.py tests/kernels/moe/__init__.py tests/kernels/moe/parallel_utils.py tests/kernels/moe/test_batched_moe.py tests/kernels/moe/test_block_fp8.py tests/kernels/moe/test_block_int8.py tests/kernels/moe/test_count_expert_num_tokens.py tests/kernels/moe/test_cutlass_grouped_gemm.py tests/kernels/moe/test_cutlass_moe.py tests/kernels/moe/test_deepep_deepgemm_moe.py tests/kernels/moe/test_deepep_moe.py tests/kernels/moe/test_deepgemm.py tests/kernels/moe/test_modular_kernel_combinations.py tests/kernels/moe/test_moe.py tests/kernels/moe/test_moe_align_block_size.py tests/kernels/moe/test_moe_permute_unpermute.py tests/kernels/moe/test_mxfp4_moe.py tests/kernels/moe/test_nvfp4_moe.py tests/kernels/moe/test_pplx_cutlass_moe.py tests/kernels/moe/test_pplx_moe.py tests/kernels/moe/test_rocm_aiter_topk.py tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py tests/kernels/moe/test_triton_moe_ptpc_fp8.py tests/kernels/moe/utils.py tests/kernels/moe/modular_kernel_tools/__init__.py tests/kernels/moe/modular_kernel_tools/cli_args.py tests/kernels/moe/modular_kernel_tools/common.py tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py tests/kernels/moe/modular_kernel_tools/mk_objects.py tests/kernels/moe/modular_kernel_tools/parallel_utils.py tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py tests/kernels/moe/modular_kernel_tools/utils.py tests/kernels/quantization/nvfp4_utils.py tests/kernels/quantization/test_allspark_gemm.py tests/kernels/quantization/test_aqlm.py tests/kernels/quantization/test_awq.py tests/kernels/quantization/test_awq_triton.py tests/kernels/quantization/test_block_fp8.py tests/kernels/quantization/test_block_int8.py tests/kernels/quantization/test_cutlass_2of4_sparse.py tests/kernels/quantization/test_cutlass_scaled_mm.py tests/kernels/quantization/test_fp8_quant.py tests/kernels/quantization/test_ggml.py tests/kernels/quantization/test_gguf.py tests/kernels/quantization/test_gptq.py tests/kernels/quantization/test_int8_kernel.py tests/kernels/quantization/test_int8_quant.py tests/kernels/quantization/test_machete_mm.py tests/kernels/quantization/test_marlin_gemm.py tests/kernels/quantization/test_nvfp4_quant.py tests/kernels/quantization/test_nvfp4_scaled_mm.py tests/kernels/quantization/test_per_token_group_quant.py tests/kernels/quantization/test_rocm_skinny_gemms.py tests/kernels/quantization/test_triton_scaled_mm.py tests/kv_transfer/test_disagg.py tests/kv_transfer/test_lookup_buffer.py tests/kv_transfer/test_lookup_buffer.sh tests/kv_transfer/test_module.py tests/kv_transfer/test_send_recv.py tests/kv_transfer/test_send_recv.sh tests/lora/__init__.py tests/lora/conftest.py tests/lora/test_add_lora.py tests/lora/test_baichuan.py tests/lora/test_chatglm3_tp.py tests/lora/test_default_mm_loras.py tests/lora/test_layers.py tests/lora/test_llama_tp.py tests/lora/test_lora_allowed_token_ids.py tests/lora/test_lora_checkpoints.py tests/lora/test_lora_functions.py tests/lora/test_lora_huggingface.py tests/lora/test_lora_manager.py tests/lora/test_minicpmv_tp.py tests/lora/test_mixtral.py tests/lora/test_peft_helper.py tests/lora/test_phi.py tests/lora/test_punica_ops.py tests/lora/test_quant_model.py tests/lora/test_qwen2vl.py tests/lora/test_resolver.py tests/lora/test_tokenizer_group.py tests/lora/test_transformers_model.py tests/lora/test_utils.py tests/lora/test_worker.py tests/lora/utils.py tests/metrics/__init__.py tests/metrics/test_metrics.py tests/mistral_tool_use/__init__.py tests/mistral_tool_use/conftest.py tests/mistral_tool_use/test_mistral_tool_calls.py tests/mistral_tool_use/utils.py tests/model_executor/__init__.py tests/model_executor/conftest.py tests/model_executor/test_enabled_custom_ops.py tests/model_executor/test_guided_processors.py tests/model_executor/test_logits_processor.py tests/model_executor/test_model_load_with_params.py tests/model_executor/test_weight_utils.py tests/models/__init__.py tests/models/registry.py tests/models/test_initialization.py tests/models/test_oot_registration.py tests/models/test_registry.py tests/models/test_transformers.py tests/models/test_utils.py tests/models/test_vision.py tests/models/utils.py tests/models/fixtures/mistral_small_3_chat.json tests/models/fixtures/pixtral_chat.json tests/models/language/__init__.py tests/models/language/generation/__init__.py tests/models/language/generation/test_bart.py tests/models/language/generation/test_common.py tests/models/language/generation/test_gemma.py tests/models/language/generation/test_granite.py tests/models/language/generation/test_hybrid.py tests/models/language/generation/test_mistral.py tests/models/language/generation/test_phimoe.py tests/models/language/pooling/__init__.py tests/models/language/pooling/embed_utils.py tests/models/language/pooling/mteb_utils.py tests/models/language/pooling/test_baai.py tests/models/language/pooling/test_bge_reranker_v2_gemma.py tests/models/language/pooling/test_classification.py tests/models/language/pooling/test_cross_encoder.py tests/models/language/pooling/test_embedding.py tests/models/language/pooling/test_gritlm.py tests/models/language/pooling/test_gte.py tests/models/language/pooling/test_intfloat.py tests/models/language/pooling/test_jina.py tests/models/language/pooling/test_mxbai_rerank.py tests/models/language/pooling/test_nomic.py tests/models/language/pooling/test_nomic_max_model_len.py tests/models/language/pooling/test_qwen3_reranker.py tests/models/language/pooling/test_reward.py tests/models/language/pooling/test_scoring.py tests/models/language/pooling/test_snowflake_arctic_embed.py tests/models/language/pooling/test_truncation_control.py tests/models/multimodal/__init__.py tests/models/multimodal/test_mapping.py tests/models/multimodal/generation/__init__.py tests/models/multimodal/generation/test_common.py tests/models/multimodal/generation/test_florence2.py tests/models/multimodal/generation/test_granite_speech.py tests/models/multimodal/generation/test_interleaved.py tests/models/multimodal/generation/test_maverick.py tests/models/multimodal/generation/test_mllama.py tests/models/multimodal/generation/test_phi4mm.py tests/models/multimodal/generation/test_pixtral.py tests/models/multimodal/generation/test_qwen2_vl.py tests/models/multimodal/generation/test_ultravox.py tests/models/multimodal/generation/test_voxtral.py tests/models/multimodal/generation/test_whisper.py tests/models/multimodal/generation/vlm_utils/__init__.py tests/models/multimodal/generation/vlm_utils/builders.py tests/models/multimodal/generation/vlm_utils/case_filtering.py tests/models/multimodal/generation/vlm_utils/core.py tests/models/multimodal/generation/vlm_utils/custom_inputs.py tests/models/multimodal/generation/vlm_utils/model_utils.py tests/models/multimodal/generation/vlm_utils/runners.py tests/models/multimodal/generation/vlm_utils/types.py tests/models/multimodal/pooling/__init__.py tests/models/multimodal/pooling/test_dse_qwen2_vl.py tests/models/multimodal/pooling/test_intern_vit.py tests/models/multimodal/pooling/test_jinavl_reranker.py tests/models/multimodal/pooling/test_llava_next.py tests/models/multimodal/pooling/test_phi3v.py tests/models/multimodal/pooling/test_prithvi_mae.py tests/models/multimodal/processing/__init__.py tests/models/multimodal/processing/test_common.py tests/models/multimodal/processing/test_h2ovl.py tests/models/multimodal/processing/test_idefics3.py tests/models/multimodal/processing/test_internvl.py tests/models/multimodal/processing/test_llama4.py tests/models/multimodal/processing/test_llava_next.py tests/models/multimodal/processing/test_llava_onevision.py tests/models/multimodal/processing/test_minimax_vl_01.py tests/models/multimodal/processing/test_mllama.py tests/models/multimodal/processing/test_nemotron_vl.py tests/models/multimodal/processing/test_phi3v.py tests/models/multimodal/processing/test_phi4mm.py tests/models/multimodal/processing/test_qwen2_vl.py tests/models/multimodal/processing/test_smolvlm.py tests/models/multimodal/processing/test_transformers.py tests/models/quantization/__init__.py tests/models/quantization/test_aqlm.py tests/models/quantization/test_awq.py tests/models/quantization/test_bitblas.py tests/models/quantization/test_bitsandbytes.py tests/models/quantization/test_fp8.py tests/models/quantization/test_gguf.py tests/models/quantization/test_gptq_bitblas.py tests/models/quantization/test_gptq_marlin.py tests/models/quantization/test_gptq_marlin_24.py tests/models/quantization/test_modelopt.py tests/models/quantization/test_mxfp4.py tests/models/quantization/test_nvfp4.py tests/mq_llm_engine/__init__.py tests/mq_llm_engine/conftest.py tests/mq_llm_engine/test_abort.py tests/mq_llm_engine/test_error_handling.py tests/mq_llm_engine/test_load.py tests/mq_llm_engine/utils.py tests/multi_step/__init__.py tests/multi_step/test_correctness_async_llm.py tests/multi_step/test_correctness_llm.py tests/multimodal/__init__.py tests/multimodal/test_hasher.py tests/multimodal/test_image.py tests/multimodal/test_inputs.py tests/multimodal/test_processing.py tests/multimodal/test_utils.py tests/multimodal/test_video.py tests/multimodal/utils.py tests/multimodal/assets/image1.png tests/multimodal/assets/image2.png tests/multimodal/assets/rgba.png tests/neuron/1_core/test_activation.py tests/neuron/1_core/test_block_table.py tests/neuron/1_core/test_cache.py tests/neuron/1_core/test_layernorm.py tests/neuron/1_core/test_logits_processor.py tests/neuron/1_core/test_neuron_model_runner.py tests/neuron/1_core/test_neuron_quant.py tests/neuron/1_core/test_prefix_prefill.py tests/neuron/1_core/test_rotary_embedding.py tests/neuron/2_core/test_comm_ops.py tests/neuron/2_core/test_eagle.py tests/neuron/2_core/test_mistral.py tests/neuron/2_core/test_multi_lora.py tests/plugins/lora_resolvers/__init__.py tests/plugins/lora_resolvers/test_filesystem_resolver.py tests/plugins/vllm_add_dummy_model/setup.py tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py tests/plugins/vllm_add_dummy_platform/setup.py tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py tests/plugins_tests/conftest.py tests/plugins_tests/test_platform_plugins.py tests/plugins_tests/test_scheduler_plugins.py tests/prefix_caching/__init__.py tests/prefix_caching/test_disable_sliding_window.py tests/prefix_caching/test_prefix_caching.py tests/prompts/example.txt tests/prompts/summary.txt tests/quantization/__init__.py tests/quantization/reference_mxfp4.py tests/quantization/test_auto_round.py tests/quantization/test_compressed_tensors.py tests/quantization/test_configs.py tests/quantization/test_cpu_offload.py tests/quantization/test_experts_int8.py tests/quantization/test_fp8.py tests/quantization/test_gptq_dynamic.py tests/quantization/test_ipex_quant.py tests/quantization/test_lm_head.py tests/quantization/test_modelopt.py tests/quantization/test_ptpc_fp8.py tests/quantization/test_quark.py tests/quantization/test_register_quantization_config.py tests/quantization/test_rtn.py tests/quantization/test_torchao.py tests/quantization/utils.py tests/reasoning/__init__.py tests/reasoning/test_deepseekr1_reasoning_parser.py tests/reasoning/test_granite_reasoning_parser.py tests/reasoning/test_hunyuan_reasoning_parser.py tests/reasoning/test_mistral_reasoning_parser.py tests/reasoning/test_qwen3_reasoning_parser.py tests/reasoning/utils.py tests/runai_model_streamer_test/__init__.py tests/runai_model_streamer_test/test_runai_model_streamer_loader.py tests/runai_model_streamer_test/test_weight_utils.py tests/samplers/__init__.py tests/samplers/test_beam_search.py tests/samplers/test_ignore_eos.py tests/samplers/test_logits_processor.py tests/samplers/test_logprobs.py tests/samplers/test_no_bad_words.py tests/samplers/test_ranks.py tests/samplers/test_sampler.py tests/samplers/test_seeded_generate.py tests/standalone_tests/lazy_imports.py tests/standalone_tests/python_only_compile.sh tests/standalone_tests/pytorch_nightly_dependency.sh tests/system_messages/sonnet3.5_nov2024.txt tests/tensorizer_loader/__init__.py tests/tensorizer_loader/conftest.py tests/tensorizer_loader/test_tensorizer.py tests/tokenization/__init__.py tests/tokenization/test_cached_tokenizer.py tests/tokenization/test_detokenize.py tests/tokenization/test_do_lower_case.py tests/tokenization/test_get_eos.py tests/tokenization/test_mistral_tokenizer.py tests/tokenization/test_tokenizer.py tests/tokenization/test_tokenizer_group.py tests/tokenization/test_tokenizer_registry.py tests/tool_use/__init__.py tests/tool_use/conftest.py tests/tool_use/test_chat_completion_request_validations.py tests/tool_use/test_chat_completions.py tests/tool_use/test_glm4_moe_tool_parser.py tests/tool_use/test_jamba_tool_parser.py tests/tool_use/test_kimi_k2_tool_parser.py tests/tool_use/test_minimax_tool_parser.py tests/tool_use/test_parallel_tool_calls.py tests/tool_use/test_qwen3coder_tool_parser.py tests/tool_use/test_tool_calls.py tests/tool_use/test_tool_choice_required.py tests/tool_use/test_xlam_tool_parser.py tests/tool_use/utils.py tests/tools/__init__.py tests/tools/test_config_validator.py tests/tpu/__init__.py tests/tpu/test_compilation.py tests/tpu/test_custom_dispatcher.py tests/tpu/test_moe_pallas.py tests/tpu/test_quantization_accuracy.py tests/tpu/lora/__init__.py tests/tpu/lora/test_lora.py tests/tracing/__init__.py tests/tracing/test_tracing.py tests/v1/__init__.py tests/v1/test_async_llm_dp.py tests/v1/test_external_lb_dp.py tests/v1/test_hybrid_lb_dp.py tests/v1/test_internal_lb_dp.py tests/v1/test_metrics_reader.py tests/v1/test_oracle.py tests/v1/test_request.py tests/v1/test_serial_utils.py tests/v1/test_utils.py tests/v1/attention/test_attention_backends.py tests/v1/attention/utils.py tests/v1/core/__init__.py tests/v1/core/test_async_scheduler.py tests/v1/core/test_kv_cache_utils.py tests/v1/core/test_prefix_caching.py tests/v1/core/test_scheduler.py tests/v1/core/test_scheduler_e2e.py tests/v1/core/test_specialized_manager.py tests/v1/core/utils.py tests/v1/e2e/__init__.py tests/v1/e2e/test_cascade_attention.py tests/v1/e2e/test_correctness_sliding_window.py tests/v1/e2e/test_spec_decode.py tests/v1/engine/__init__.py tests/v1/engine/conftest.py tests/v1/engine/test_async_llm.py tests/v1/engine/test_engine_args.py tests/v1/engine/test_engine_core.py tests/v1/engine/test_engine_core_client.py tests/v1/engine/test_fast_incdec_prefix_err.py tests/v1/engine/test_llm_engine.py tests/v1/engine/test_output_processor.py tests/v1/engine/utils.py tests/v1/entrypoints/__init__.py tests/v1/entrypoints/conftest.py tests/v1/entrypoints/llm/__init__.py tests/v1/entrypoints/llm/test_struct_output_generate.py tests/v1/entrypoints/openai/test_chat_completion.py tests/v1/entrypoints/openai/test_completion.py tests/v1/entrypoints/openai/test_multi_api_servers.py tests/v1/entrypoints/openai/responses/__init__.py tests/v1/entrypoints/openai/responses/conftest.py tests/v1/entrypoints/openai/responses/test_basic.py tests/v1/entrypoints/openai/responses/test_image.py tests/v1/entrypoints/openai/responses/test_stateful.py tests/v1/entrypoints/openai/responses/test_structured_output.py tests/v1/kv_connector/__init__.py tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh tests/v1/kv_connector/nixl_integration/test_accuracy.py tests/v1/kv_connector/nixl_integration/test_edge_cases.py tests/v1/kv_connector/nixl_integration/toy_proxy_server.py tests/v1/kv_connector/unit/__init__.py tests/v1/kv_connector/unit/test_multi_connector.py tests/v1/kv_connector/unit/test_nixl_connector.py tests/v1/kv_connector/unit/test_output_aggreagator.py tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py tests/v1/kv_connector/unit/utils.py tests/v1/metrics/test_ray_metrics.py tests/v1/sample/__init__.py tests/v1/sample/test_logits_processors.py tests/v1/sample/test_logprobs.py tests/v1/sample/test_logprobs_e2e.py tests/v1/sample/test_rejection_sampler.py tests/v1/sample/test_sampler.py tests/v1/sample/test_sampling_params_e2e.py tests/v1/sample/test_topk_topp_sampler.py tests/v1/sample/utils.py tests/v1/shutdown/test_delete.py tests/v1/shutdown/test_forward_error.py tests/v1/shutdown/test_processor_error.py tests/v1/shutdown/test_startup_error.py tests/v1/shutdown/utils.py tests/v1/spec_decode/test_eagle.py tests/v1/spec_decode/test_max_len.py tests/v1/spec_decode/test_ngram.py tests/v1/structured_output/__init__.py tests/v1/structured_output/test_utils.py tests/v1/tpu/__init__.py tests/v1/tpu/test_basic.py tests/v1/tpu/test_kv_cache_update_kernel.py tests/v1/tpu/test_mha_attn.py tests/v1/tpu/test_multimodal.py tests/v1/tpu/test_pallas.py tests/v1/tpu/test_perf.py tests/v1/tpu/test_sampler.py tests/v1/tpu/test_spmd_model_weight_loading.py tests/v1/tpu/test_topk_topp_sampler.py tests/v1/tpu/test_tpu_qkv_linear.py tests/v1/tpu/worker/__init__.py tests/v1/tpu/worker/test_tpu_model_runner.py tests/v1/worker/__init__.py tests/v1/worker/test_gpu_input_batch.py tests/v1/worker/test_gpu_model_runner.py tests/vllm_test_utils/setup.py tests/vllm_test_utils/vllm_test_utils/__init__.py tests/vllm_test_utils/vllm_test_utils/blame.py tests/vllm_test_utils/vllm_test_utils/monitor.py tests/weight_loading/models-large.txt tests/weight_loading/models.txt tests/weight_loading/run_model_weight_loading_test.sh tests/weight_loading/test_weight_loading.py tests/worker/__init__.py tests/worker/conftest.py tests/worker/test_encoder_decoder_model_runner.py tests/worker/test_model_input.py tests/worker/test_model_runner.py tests/worker/test_profile.py tests/worker/test_swap.py tools/check_init_lazy_imports.py tools/check_pickle_imports.py tools/check_repo.sh tools/check_spdx_header.py tools/check_triton_import.py tools/enforce_regex_import.py tools/generate_cmake_presets.py tools/generate_nightly_torch_test.py tools/install_nixl.sh tools/mypy.sh tools/png-lint.sh tools/report_build_time_ninja.py tools/shellcheck.sh tools/update-dockerfile-graph.sh tools/validate_config.py tools/ep_kernels/README.md tools/ep_kernels/configure_system_drivers.sh tools/ep_kernels/install_python_libraries.sh tools/ep_kernels/elastic_ep/eep_nvshmem.patch tools/ep_kernels/elastic_ep/install_eep_libraries.sh tools/profiler/print_layerwise_table.py tools/profiler/visualize_layerwise_profile.py vllm/__init__.py vllm/_custom_ops.py vllm/_ipex_ops.py vllm/_version.py vllm/beam_search.py vllm/collect_env.py vllm/config.py vllm/connections.py vllm/env_override.py vllm/envs.py vllm/forward_context.py vllm/jsontree.py vllm/logger.py vllm/logits_process.py vllm/outputs.py vllm/pooling_params.py vllm/py.typed vllm/sampling_params.py vllm/scalar_type.py vllm/scripts.py vllm/sequence.py vllm/test_utils.py vllm/tracing.py vllm/version.py vllm.egg-info/PKG-INFO vllm.egg-info/SOURCES.txt vllm.egg-info/dependency_links.txt vllm.egg-info/entry_points.txt vllm.egg-info/requires.txt vllm.egg-info/top_level.txt vllm/adapter_commons/__init__.py vllm/adapter_commons/layers.py vllm/adapter_commons/models.py vllm/adapter_commons/request.py vllm/adapter_commons/utils.py vllm/adapter_commons/worker_manager.py vllm/assets/__init__.py vllm/assets/audio.py vllm/assets/base.py vllm/assets/image.py vllm/assets/video.py vllm/attention/__init__.py vllm/attention/layer.py vllm/attention/selector.py vllm/attention/backends/__init__.py vllm/attention/backends/abstract.py vllm/attention/backends/differential_flash_attn.py vllm/attention/backends/dual_chunk_flash_attn.py vllm/attention/backends/flash_attn.py vllm/attention/backends/flashinfer.py vllm/attention/backends/flashmla.py vllm/attention/backends/placeholder_attn.py vllm/attention/backends/rocm_aiter_mla.py vllm/attention/backends/rocm_flash_attn.py vllm/attention/backends/triton_mla.py vllm/attention/backends/utils.py vllm/attention/backends/xformers.py vllm/attention/backends/mla/__init__.py vllm/attention/backends/mla/common.py vllm/attention/ops/__init__.py vllm/attention/ops/chunked_prefill_paged_decode.py vllm/attention/ops/flashmla.py vllm/attention/ops/merge_attn_states.py vllm/attention/ops/nki_flash_attn.py vllm/attention/ops/paged_attn.py vllm/attention/ops/pallas_kv_cache_update.py vllm/attention/ops/prefix_prefill.py vllm/attention/ops/rocm_aiter_mla.py vllm/attention/ops/rocm_aiter_paged_attn.py vllm/attention/ops/triton_decode_attention.py vllm/attention/ops/triton_flash_attention.py vllm/attention/ops/triton_merge_attn_states.py vllm/attention/ops/triton_unified_attention.py vllm/attention/utils/__init__.py vllm/attention/utils/fa_utils.py vllm/attention/utils/kv_sharing_utils.py vllm/benchmarks/__init__.py vllm/benchmarks/datasets.py vllm/benchmarks/endpoint_request_func.py vllm/benchmarks/latency.py vllm/benchmarks/serve.py vllm/benchmarks/throughput.py vllm/benchmarks/utils.py vllm/compilation/__init__.py vllm/compilation/activation_quant_fusion.py vllm/compilation/backends.py vllm/compilation/base_piecewise_backend.py vllm/compilation/collective_fusion.py vllm/compilation/compiler_interface.py vllm/compilation/counter.py vllm/compilation/cuda_piecewise_backend.py vllm/compilation/decorators.py vllm/compilation/fix_functionalization.py vllm/compilation/fusion.py vllm/compilation/fusion_attn.py vllm/compilation/fx_utils.py vllm/compilation/inductor_pass.py vllm/compilation/monitor.py vllm/compilation/multi_output_match.py vllm/compilation/noop_elimination.py vllm/compilation/pass_manager.py vllm/compilation/sequence_parallelism.py vllm/compilation/torch25_custom_graph_pass.py vllm/compilation/vllm_inductor_pass.py vllm/compilation/wrapper.py vllm/core/__init__.py vllm/core/block_manager.py vllm/core/evictor.py vllm/core/interfaces.py vllm/core/placeholder_block_space_manager.py vllm/core/scheduler.py vllm/core/block/__init__.py vllm/core/block/block_table.py vllm/core/block/common.py vllm/core/block/cpu_gpu_block_allocator.py vllm/core/block/interfaces.py vllm/core/block/naive_block.py vllm/core/block/prefix_caching_block.py vllm/core/block/utils.py vllm/device_allocator/__init__.py vllm/device_allocator/cumem.py vllm/distributed/__init__.py vllm/distributed/communication_op.py vllm/distributed/kv_events.py vllm/distributed/parallel_state.py vllm/distributed/tpu_distributed_utils.py vllm/distributed/utils.py vllm/distributed/device_communicators/__init__.py vllm/distributed/device_communicators/all2all.py vllm/distributed/device_communicators/base_device_communicator.py vllm/distributed/device_communicators/cpu_communicator.py vllm/distributed/device_communicators/cuda_communicator.py vllm/distributed/device_communicators/cuda_wrapper.py vllm/distributed/device_communicators/custom_all_reduce.py vllm/distributed/device_communicators/custom_all_reduce_utils.py vllm/distributed/device_communicators/neuron_communicator.py vllm/distributed/device_communicators/pynccl.py vllm/distributed/device_communicators/pynccl_wrapper.py vllm/distributed/device_communicators/quick_all_reduce.py vllm/distributed/device_communicators/shm_broadcast.py vllm/distributed/device_communicators/tpu_communicator.py vllm/distributed/device_communicators/xpu_communicator.py vllm/distributed/eplb/__init__.py vllm/distributed/eplb/eplb_state.py vllm/distributed/eplb/rebalance_algo.py vllm/distributed/eplb/rebalance_execute.py vllm/distributed/kv_transfer/README.md vllm/distributed/kv_transfer/__init__.py vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg vllm/distributed/kv_transfer/kv_connector_agent.py vllm/distributed/kv_transfer/kv_transfer_state.py vllm/distributed/kv_transfer/kv_connector/__init__.py vllm/distributed/kv_transfer/kv_connector/base.py vllm/distributed/kv_transfer/kv_connector/factory.py vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py vllm/distributed/kv_transfer/kv_connector/p2p_connector.py vllm/distributed/kv_transfer/kv_connector/simple_connector.py vllm/distributed/kv_transfer/kv_connector/utils.py vllm/distributed/kv_transfer/kv_connector/v1/__init__.py vllm/distributed/kv_transfer/kv_connector/v1/base.py vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py vllm/distributed/kv_transfer/kv_connector/v1/p2p/flagcx_p2p_nccl_engine.py vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py vllm/distributed/kv_transfer/kv_lookup_buffer/base.py vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py vllm/distributed/kv_transfer/kv_pipe/__init__.py vllm/distributed/kv_transfer/kv_pipe/base.py vllm/distributed/kv_transfer/kv_pipe/flagcx_p2p_nccl_pipe.py vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py vllm/engine/__init__.py vllm/engine/arg_utils.py vllm/engine/async_llm_engine.py vllm/engine/async_timeout.py vllm/engine/llm_engine.py vllm/engine/metrics.py vllm/engine/metrics_types.py vllm/engine/protocol.py vllm/engine/multiprocessing/__init__.py vllm/engine/multiprocessing/client.py vllm/engine/multiprocessing/engine.py vllm/engine/output_processor/__init__.py vllm/engine/output_processor/interfaces.py vllm/engine/output_processor/multi_step.py vllm/engine/output_processor/single_step.py vllm/engine/output_processor/stop_checker.py vllm/engine/output_processor/util.py vllm/entrypoints/__init__.py vllm/entrypoints/api_server.py vllm/entrypoints/chat_utils.py vllm/entrypoints/launcher.py vllm/entrypoints/llm.py vllm/entrypoints/logger.py vllm/entrypoints/score_utils.py vllm/entrypoints/ssl.py vllm/entrypoints/utils.py vllm/entrypoints/cli/__init__.py vllm/entrypoints/cli/collect_env.py vllm/entrypoints/cli/main.py vllm/entrypoints/cli/openai.py vllm/entrypoints/cli/run_batch.py vllm/entrypoints/cli/serve.py vllm/entrypoints/cli/types.py vllm/entrypoints/cli/benchmark/__init__.py vllm/entrypoints/cli/benchmark/base.py vllm/entrypoints/cli/benchmark/latency.py vllm/entrypoints/cli/benchmark/main.py vllm/entrypoints/cli/benchmark/serve.py vllm/entrypoints/cli/benchmark/throughput.py vllm/entrypoints/openai/__init__.py vllm/entrypoints/openai/api_server.py vllm/entrypoints/openai/cli_args.py vllm/entrypoints/openai/logits_processors.py vllm/entrypoints/openai/protocol.py vllm/entrypoints/openai/run_batch.py vllm/entrypoints/openai/serving_chat.py vllm/entrypoints/openai/serving_classification.py vllm/entrypoints/openai/serving_completion.py vllm/entrypoints/openai/serving_embedding.py vllm/entrypoints/openai/serving_engine.py vllm/entrypoints/openai/serving_models.py vllm/entrypoints/openai/serving_pooling.py vllm/entrypoints/openai/serving_responses.py vllm/entrypoints/openai/serving_score.py vllm/entrypoints/openai/serving_tokenization.py vllm/entrypoints/openai/serving_transcription.py vllm/entrypoints/openai/speech_to_text.py vllm/entrypoints/openai/tool_parsers/__init__.py vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py vllm/entrypoints/openai/tool_parsers/utils.py vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py vllm/executor/__init__.py vllm/executor/executor_base.py vllm/executor/mp_distributed_executor.py vllm/executor/msgspec_utils.py vllm/executor/multiproc_worker_utils.py vllm/executor/ray_distributed_executor.py vllm/executor/ray_utils.py vllm/executor/uniproc_executor.py vllm/inputs/__init__.py vllm/inputs/data.py vllm/inputs/parse.py vllm/inputs/preprocess.py vllm/inputs/registry.py vllm/logging_utils/__init__.py vllm/logging_utils/dump_input.py vllm/logging_utils/formatter.py vllm/lora/__init__.py vllm/lora/fully_sharded_layers.py vllm/lora/layers.py vllm/lora/lora.py vllm/lora/models.py vllm/lora/peft_helper.py vllm/lora/request.py vllm/lora/resolver.py vllm/lora/utils.py vllm/lora/worker_manager.py vllm/lora/ops/__init__.py vllm/lora/ops/torch_ops/__init__.py vllm/lora/ops/torch_ops/lora_ops.py vllm/lora/ops/triton_ops/__init__.py vllm/lora/ops/triton_ops/kernel_utils.py vllm/lora/ops/triton_ops/lora_expand_op.py vllm/lora/ops/triton_ops/lora_kernel_metadata.py vllm/lora/ops/triton_ops/lora_shrink_op.py vllm/lora/ops/triton_ops/utils.py vllm/lora/ops/xla_ops/__init__.py vllm/lora/ops/xla_ops/lora_ops.py vllm/lora/punica_wrapper/__init__.py vllm/lora/punica_wrapper/punica_base.py vllm/lora/punica_wrapper/punica_cpu.py vllm/lora/punica_wrapper/punica_gpu.py vllm/lora/punica_wrapper/punica_selector.py vllm/lora/punica_wrapper/punica_tpu.py vllm/lora/punica_wrapper/utils.py vllm/mocks/__init__.py vllm/mocks/mock_nixl_connector.py vllm/model_executor/__init__.py vllm/model_executor/custom_op.py vllm/model_executor/parameter.py vllm/model_executor/pooling_metadata.py vllm/model_executor/sampling_metadata.py vllm/model_executor/utils.py vllm/model_executor/guided_decoding/__init__.py vllm/model_executor/guided_decoding/guidance_decoding.py vllm/model_executor/guided_decoding/guidance_logits_processors.py vllm/model_executor/guided_decoding/guided_fields.py vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py vllm/model_executor/guided_decoding/outlines_decoding.py vllm/model_executor/guided_decoding/outlines_logits_processors.py vllm/model_executor/guided_decoding/utils.py vllm/model_executor/guided_decoding/xgrammar_decoding.py vllm/model_executor/layers/__init__.py vllm/model_executor/layers/activation.py vllm/model_executor/layers/layernorm.py vllm/model_executor/layers/lightning_attn.py vllm/model_executor/layers/linear.py vllm/model_executor/layers/logits_processor.py vllm/model_executor/layers/pooler.py vllm/model_executor/layers/resampler.py vllm/model_executor/layers/rotary_embedding.py vllm/model_executor/layers/sampler.py vllm/model_executor/layers/utils.py vllm/model_executor/layers/vocab_parallel_embedding.py vllm/model_executor/layers/fused_moe/__init__.py vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py vllm/model_executor/layers/fused_moe/config.py vllm/model_executor/layers/fused_moe/cpu_fused_moe.py vllm/model_executor/layers/fused_moe/cutlass_moe.py vllm/model_executor/layers/fused_moe/deep_gemm_moe.py vllm/model_executor/layers/fused_moe/deep_gemm_utils.py vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py vllm/model_executor/layers/fused_moe/fused_batched_moe.py vllm/model_executor/layers/fused_moe/fused_marlin_moe.py vllm/model_executor/layers/fused_moe/fused_moe.py vllm/model_executor/layers/fused_moe/layer.py vllm/model_executor/layers/fused_moe/modular_kernel.py vllm/model_executor/layers/fused_moe/moe_align_block_size.py vllm/model_executor/layers/fused_moe/moe_pallas.py vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py vllm/model_executor/layers/fused_moe/moe_torch_iterative.py vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py vllm/model_executor/layers/fused_moe/prepare_finalize.py vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py vllm/model_executor/layers/fused_moe/utils.py vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json vllm/model_executor/layers/fused_moe/configs/README vllm/model_executor/layers/mamba/__init__.py vllm/model_executor/layers/mamba/abstract.py vllm/model_executor/layers/mamba/mamba2_metadata.py vllm/model_executor/layers/mamba/mamba_mixer.py vllm/model_executor/layers/mamba/mamba_mixer2.py vllm/model_executor/layers/mamba/mamba_utils.py vllm/model_executor/layers/mamba/ops/__init__.py vllm/model_executor/layers/mamba/ops/causal_conv1d.py vllm/model_executor/layers/mamba/ops/mamba_ssm.py vllm/model_executor/layers/mamba/ops/ssd_bmm.py vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py vllm/model_executor/layers/mamba/ops/ssd_combined.py vllm/model_executor/layers/mamba/ops/ssd_state_passing.py vllm/model_executor/layers/quantization/__init__.py vllm/model_executor/layers/quantization/aqlm.py vllm/model_executor/layers/quantization/auto_round.py vllm/model_executor/layers/quantization/awq.py vllm/model_executor/layers/quantization/awq_marlin.py vllm/model_executor/layers/quantization/awq_triton.py vllm/model_executor/layers/quantization/base_config.py vllm/model_executor/layers/quantization/bitblas.py vllm/model_executor/layers/quantization/bitsandbytes.py vllm/model_executor/layers/quantization/deepgemm.py vllm/model_executor/layers/quantization/deepspeedfp.py vllm/model_executor/layers/quantization/experts_int8.py vllm/model_executor/layers/quantization/fbgemm_fp8.py vllm/model_executor/layers/quantization/fp8.py vllm/model_executor/layers/quantization/gguf.py vllm/model_executor/layers/quantization/gptq.py vllm/model_executor/layers/quantization/gptq_bitblas.py vllm/model_executor/layers/quantization/gptq_marlin.py vllm/model_executor/layers/quantization/gptq_marlin_24.py vllm/model_executor/layers/quantization/hqq_marlin.py vllm/model_executor/layers/quantization/inc.py vllm/model_executor/layers/quantization/input_quant_fp8.py vllm/model_executor/layers/quantization/ipex_quant.py vllm/model_executor/layers/quantization/kv_cache.py vllm/model_executor/layers/quantization/marlin.py vllm/model_executor/layers/quantization/modelopt.py vllm/model_executor/layers/quantization/moe_wna16.py vllm/model_executor/layers/quantization/neuron_quant.py vllm/model_executor/layers/quantization/ptpc_fp8.py vllm/model_executor/layers/quantization/qqq.py vllm/model_executor/layers/quantization/rtn.py vllm/model_executor/layers/quantization/schema.py vllm/model_executor/layers/quantization/torchao.py vllm/model_executor/layers/quantization/tpu_int8.py vllm/model_executor/layers/quantization/compressed_tensors/__init__.py vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py vllm/model_executor/layers/quantization/compressed_tensors/utils.py vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py vllm/model_executor/layers/quantization/kernels/__init__.py vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py vllm/model_executor/layers/quantization/quark/__init__.py vllm/model_executor/layers/quantization/quark/quark.py vllm/model_executor/layers/quantization/quark/quark_moe.py vllm/model_executor/layers/quantization/quark/utils.py vllm/model_executor/layers/quantization/quark/schemes/__init__.py vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py vllm/model_executor/layers/quantization/utils/__init__.py vllm/model_executor/layers/quantization/utils/allspark_utils.py vllm/model_executor/layers/quantization/utils/bitblas_utils.py vllm/model_executor/layers/quantization/utils/fp8_utils.py vllm/model_executor/layers/quantization/utils/gptq_utils.py vllm/model_executor/layers/quantization/utils/int8_utils.py vllm/model_executor/layers/quantization/utils/layer_utils.py vllm/model_executor/layers/quantization/utils/machete_utils.py vllm/model_executor/layers/quantization/utils/marlin_utils.py vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py vllm/model_executor/layers/quantization/utils/marlin_utils_test.py vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py vllm/model_executor/layers/quantization/utils/mxfp4_utils.py vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py vllm/model_executor/layers/quantization/utils/quant_utils.py vllm/model_executor/layers/quantization/utils/w8a8_utils.py vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json vllm/model_executor/model_loader/__init__.py vllm/model_executor/model_loader/base_loader.py vllm/model_executor/model_loader/bitsandbytes_loader.py vllm/model_executor/model_loader/default_loader.py vllm/model_executor/model_loader/dummy_loader.py vllm/model_executor/model_loader/gguf_loader.py vllm/model_executor/model_loader/neuron.py vllm/model_executor/model_loader/neuronx_distributed.py vllm/model_executor/model_loader/runai_streamer_loader.py vllm/model_executor/model_loader/sharded_state_loader.py vllm/model_executor/model_loader/tensorizer.py vllm/model_executor/model_loader/tensorizer_loader.py vllm/model_executor/model_loader/tpu.py vllm/model_executor/model_loader/utils.py vllm/model_executor/model_loader/weight_utils.py vllm/model_executor/models/__init__.py vllm/model_executor/models/adapters.py vllm/model_executor/models/aimv2.py vllm/model_executor/models/arcee.py vllm/model_executor/models/arctic.py vllm/model_executor/models/aria.py vllm/model_executor/models/aya_vision.py vllm/model_executor/models/baichuan.py vllm/model_executor/models/bailing_moe.py vllm/model_executor/models/bamba.py vllm/model_executor/models/bart.py vllm/model_executor/models/bert.py vllm/model_executor/models/bert_with_rope.py vllm/model_executor/models/blip.py vllm/model_executor/models/blip2.py vllm/model_executor/models/bloom.py vllm/model_executor/models/chameleon.py vllm/model_executor/models/chatglm.py vllm/model_executor/models/clip.py vllm/model_executor/models/commandr.py vllm/model_executor/models/config.py vllm/model_executor/models/constant_size_cache.py vllm/model_executor/models/dbrx.py vllm/model_executor/models/deepseek.py vllm/model_executor/models/deepseek_mtp.py vllm/model_executor/models/deepseek_v2.py vllm/model_executor/models/deepseek_vl2.py vllm/model_executor/models/dots1.py vllm/model_executor/models/ernie45.py vllm/model_executor/models/ernie45_moe.py vllm/model_executor/models/exaone.py vllm/model_executor/models/exaone4.py vllm/model_executor/models/fairseq2_llama.py vllm/model_executor/models/falcon.py vllm/model_executor/models/falcon_h1.py vllm/model_executor/models/florence2.py vllm/model_executor/models/fuyu.py vllm/model_executor/models/gemma.py vllm/model_executor/models/gemma2.py vllm/model_executor/models/gemma3.py vllm/model_executor/models/gemma3_mm.py vllm/model_executor/models/gemma3n.py vllm/model_executor/models/glm.py vllm/model_executor/models/glm4.py vllm/model_executor/models/glm4_1v.py vllm/model_executor/models/glm4_moe.py vllm/model_executor/models/glm4_moe_mtp.py vllm/model_executor/models/glm4v.py vllm/model_executor/models/gpt2.py vllm/model_executor/models/gpt_bigcode.py vllm/model_executor/models/gpt_j.py vllm/model_executor/models/gpt_neox.py vllm/model_executor/models/granite.py vllm/model_executor/models/granite_speech.py vllm/model_executor/models/granitemoe.py vllm/model_executor/models/granitemoehybrid.py vllm/model_executor/models/granitemoeshared.py vllm/model_executor/models/gritlm.py vllm/model_executor/models/grok1.py vllm/model_executor/models/h2ovl.py vllm/model_executor/models/hunyuan_v1.py vllm/model_executor/models/idefics2_vision_model.py vllm/model_executor/models/idefics3.py vllm/model_executor/models/interfaces.py vllm/model_executor/models/interfaces_base.py vllm/model_executor/models/intern_vit.py vllm/model_executor/models/internlm2.py vllm/model_executor/models/internlm2_ve.py vllm/model_executor/models/internvl.py vllm/model_executor/models/jais.py vllm/model_executor/models/jamba.py vllm/model_executor/models/jina_vl.py vllm/model_executor/models/keye.py vllm/model_executor/models/kimi_vl.py vllm/model_executor/models/llama.py vllm/model_executor/models/llama4.py vllm/model_executor/models/llama4_eagle.py vllm/model_executor/models/llama_eagle.py vllm/model_executor/models/llama_eagle3.py vllm/model_executor/models/llava.py vllm/model_executor/models/llava_next.py vllm/model_executor/models/llava_next_video.py vllm/model_executor/models/llava_onevision.py vllm/model_executor/models/mamba.py vllm/model_executor/models/mamba2.py vllm/model_executor/models/mamba_cache.py vllm/model_executor/models/medusa.py vllm/model_executor/models/mimo.py vllm/model_executor/models/mimo_mtp.py vllm/model_executor/models/minicpm.py vllm/model_executor/models/minicpm3.py vllm/model_executor/models/minicpm_eagle.py vllm/model_executor/models/minicpmo.py vllm/model_executor/models/minicpmv.py vllm/model_executor/models/minimax_cache.py vllm/model_executor/models/minimax_text_01.py vllm/model_executor/models/minimax_vl_01.py vllm/model_executor/models/mistral3.py vllm/model_executor/models/mixtral.py vllm/model_executor/models/mixtral_quant.py vllm/model_executor/models/mllama.py vllm/model_executor/models/mllama4.py vllm/model_executor/models/mlp_speculator.py vllm/model_executor/models/modernbert.py vllm/model_executor/models/module_mapping.py vllm/model_executor/models/molmo.py vllm/model_executor/models/moonvit.py vllm/model_executor/models/mpt.py vllm/model_executor/models/nemotron.py vllm/model_executor/models/nemotron_h.py vllm/model_executor/models/nemotron_nas.py vllm/model_executor/models/nemotron_vl.py vllm/model_executor/models/nvlm_d.py vllm/model_executor/models/olmo.py vllm/model_executor/models/olmo2.py vllm/model_executor/models/olmoe.py vllm/model_executor/models/opt.py vllm/model_executor/models/orion.py vllm/model_executor/models/ovis.py vllm/model_executor/models/paligemma.py vllm/model_executor/models/persimmon.py vllm/model_executor/models/phi.py vllm/model_executor/models/phi3.py vllm/model_executor/models/phi3v.py vllm/model_executor/models/phi4flash.py vllm/model_executor/models/phi4mm.py vllm/model_executor/models/phi4mm_audio.py vllm/model_executor/models/phi4mm_utils.py vllm/model_executor/models/phimoe.py vllm/model_executor/models/pixtral.py vllm/model_executor/models/plamo2.py vllm/model_executor/models/prithvi_geospatial_mae.py vllm/model_executor/models/qwen.py vllm/model_executor/models/qwen2.py vllm/model_executor/models/qwen2_5_omni_thinker.py vllm/model_executor/models/qwen2_5_vl.py vllm/model_executor/models/qwen2_audio.py vllm/model_executor/models/qwen2_moe.py vllm/model_executor/models/qwen2_rm.py vllm/model_executor/models/qwen2_vl.py vllm/model_executor/models/qwen3.py vllm/model_executor/models/qwen3_moe.py vllm/model_executor/models/qwen_vl.py vllm/model_executor/models/registry.py vllm/model_executor/models/roberta.py vllm/model_executor/models/siglip.py vllm/model_executor/models/skyworkr1v.py vllm/model_executor/models/smolvlm.py vllm/model_executor/models/solar.py vllm/model_executor/models/stablelm.py vllm/model_executor/models/starcoder2.py vllm/model_executor/models/tarsier.py vllm/model_executor/models/telechat2.py vllm/model_executor/models/teleflm.py vllm/model_executor/models/transformers.py vllm/model_executor/models/ultravox.py vllm/model_executor/models/utils.py vllm/model_executor/models/vision.py vllm/model_executor/models/voxtral.py vllm/model_executor/models/whisper.py vllm/model_executor/models/zamba2.py vllm/multimodal/__init__.py vllm/multimodal/audio.py vllm/multimodal/base.py vllm/multimodal/hasher.py vllm/multimodal/image.py vllm/multimodal/inputs.py vllm/multimodal/parse.py vllm/multimodal/processing.py vllm/multimodal/profiling.py vllm/multimodal/registry.py vllm/multimodal/utils.py vllm/multimodal/video.py vllm/platforms/__init__.py vllm/platforms/cpu.py vllm/platforms/cuda.py vllm/platforms/interface.py vllm/platforms/neuron.py vllm/platforms/rocm.py vllm/platforms/tpu.py vllm/platforms/xpu.py vllm/plugins/__init__.py vllm/plugins/lora_resolvers/README.md vllm/plugins/lora_resolvers/__init__.py vllm/plugins/lora_resolvers/filesystem_resolver.py vllm/profiler/__init__.py vllm/profiler/layerwise_profile.py vllm/profiler/utils.py vllm/ray/__init__.py vllm/ray/ray_env.py vllm/reasoning/__init__.py vllm/reasoning/abs_reasoning_parsers.py vllm/reasoning/deepseek_r1_reasoning_parser.py vllm/reasoning/glm4_moe_reasoning_parser.py vllm/reasoning/granite_reasoning_parser.py vllm/reasoning/hunyuan_a13b_reasoning_parser.py vllm/reasoning/mistral_reasoning_parser.py vllm/reasoning/qwen3_reasoning_parser.py vllm/third_party/__init__.py vllm/third_party/pynvml.py vllm/transformers_utils/__init__.py vllm/transformers_utils/config.py vllm/transformers_utils/detokenizer.py vllm/transformers_utils/detokenizer_utils.py vllm/transformers_utils/processor.py vllm/transformers_utils/s3_utils.py vllm/transformers_utils/tokenizer.py vllm/transformers_utils/tokenizer_base.py vllm/transformers_utils/tokenizer_group.py vllm/transformers_utils/utils.py vllm/transformers_utils/chat_templates/__init__.py vllm/transformers_utils/chat_templates/registry.py vllm/transformers_utils/chat_templates/template_basic.jinja vllm/transformers_utils/chat_templates/template_blip2.jinja vllm/transformers_utils/chat_templates/template_chatml.jinja vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja vllm/transformers_utils/chat_templates/template_fuyu.jinja vllm/transformers_utils/configs/__init__.py vllm/transformers_utils/configs/arctic.py vllm/transformers_utils/configs/chatglm.py vllm/transformers_utils/configs/cohere2.py vllm/transformers_utils/configs/dbrx.py vllm/transformers_utils/configs/deepseek_vl2.py vllm/transformers_utils/configs/eagle.py vllm/transformers_utils/configs/exaone.py vllm/transformers_utils/configs/exaone4.py vllm/transformers_utils/configs/falcon.py vllm/transformers_utils/configs/jais.py vllm/transformers_utils/configs/kimi_vl.py vllm/transformers_utils/configs/medusa.py vllm/transformers_utils/configs/minimax_text_01.py vllm/transformers_utils/configs/minimax_vl_01.py vllm/transformers_utils/configs/mistral.py vllm/transformers_utils/configs/mllama.py vllm/transformers_utils/configs/mlp_speculator.py vllm/transformers_utils/configs/moonvit.py vllm/transformers_utils/configs/mpt.py vllm/transformers_utils/configs/nemotron.py vllm/transformers_utils/configs/nemotron_h.py vllm/transformers_utils/configs/nemotron_vl.py vllm/transformers_utils/configs/nvlm_d.py vllm/transformers_utils/configs/ovis.py vllm/transformers_utils/configs/skyworkr1v.py vllm/transformers_utils/configs/solar.py vllm/transformers_utils/configs/telechat2.py vllm/transformers_utils/configs/ultravox.py vllm/transformers_utils/processors/__init__.py vllm/transformers_utils/processors/deepseek_vl2.py vllm/transformers_utils/processors/ovis.py vllm/transformers_utils/tokenizers/__init__.py vllm/transformers_utils/tokenizers/mistral.py vllm/triton_utils/__init__.py vllm/triton_utils/importing.py vllm/usage/__init__.py vllm/usage/usage_lib.py vllm/utils/__init__.py vllm/utils/deep_gemm.py vllm/utils/flashinfer.py vllm/v1/__init__.py vllm/v1/kv_cache_interface.py vllm/v1/outputs.py vllm/v1/request.py vllm/v1/serial_utils.py vllm/v1/utils.py vllm/v1/attention/__init__.py vllm/v1/attention/backends/__init__.py vllm/v1/attention/backends/cpu_attn.py vllm/v1/attention/backends/flash_attn.py vllm/v1/attention/backends/flashinfer.py vllm/v1/attention/backends/flex_attention.py vllm/v1/attention/backends/mamba_attn.py vllm/v1/attention/backends/pallas.py vllm/v1/attention/backends/rocm_aiter_fa.py vllm/v1/attention/backends/triton_attn.py vllm/v1/attention/backends/utils.py vllm/v1/attention/backends/mla/__init__.py vllm/v1/attention/backends/mla/common.py vllm/v1/attention/backends/mla/cutlass_mla.py vllm/v1/attention/backends/mla/flashmla.py vllm/v1/attention/backends/mla/rocm_aiter_mla.py vllm/v1/attention/backends/mla/triton_mla.py vllm/v1/core/__init__.py vllm/v1/core/block_pool.py vllm/v1/core/encoder_cache_manager.py vllm/v1/core/kv_cache_coordinator.py vllm/v1/core/kv_cache_manager.py vllm/v1/core/kv_cache_utils.py vllm/v1/core/single_type_kv_cache_manager.py vllm/v1/core/sched/__init__.py vllm/v1/core/sched/async_scheduler.py vllm/v1/core/sched/interface.py vllm/v1/core/sched/output.py vllm/v1/core/sched/request_queue.py vllm/v1/core/sched/scheduler.py vllm/v1/core/sched/utils.py vllm/v1/engine/__init__.py vllm/v1/engine/async_llm.py vllm/v1/engine/coordinator.py vllm/v1/engine/core.py vllm/v1/engine/core_client.py vllm/v1/engine/detokenizer.py vllm/v1/engine/exceptions.py vllm/v1/engine/llm_engine.py vllm/v1/engine/logprobs.py vllm/v1/engine/mm_input_cache.py vllm/v1/engine/output_processor.py vllm/v1/engine/parallel_sampling.py vllm/v1/engine/processor.py vllm/v1/engine/utils.py vllm/v1/executor/__init__.py vllm/v1/executor/abstract.py vllm/v1/executor/multiproc_executor.py vllm/v1/executor/ray_distributed_executor.py vllm/v1/metrics/__init__.py vllm/v1/metrics/loggers.py vllm/v1/metrics/prometheus.py vllm/v1/metrics/ray_wrappers.py vllm/v1/metrics/reader.py vllm/v1/metrics/stats.py vllm/v1/pool/__init__.py vllm/v1/pool/metadata.py vllm/v1/sample/__init__.py vllm/v1/sample/logits_processor.py vllm/v1/sample/metadata.py vllm/v1/sample/rejection_sampler.py vllm/v1/sample/sampler.py vllm/v1/sample/ops/__init__.py vllm/v1/sample/ops/bad_words.py vllm/v1/sample/ops/logprobs.py vllm/v1/sample/ops/penalties.py vllm/v1/sample/ops/topk_topp_sampler.py vllm/v1/sample/tpu/__init__.py vllm/v1/sample/tpu/metadata.py vllm/v1/sample/tpu/sampler.py vllm/v1/spec_decode/__init__.py vllm/v1/spec_decode/eagle.py vllm/v1/spec_decode/medusa.py vllm/v1/spec_decode/metadata.py vllm/v1/spec_decode/metrics.py vllm/v1/spec_decode/ngram_proposer.py vllm/v1/spec_decode/utils.py vllm/v1/structured_output/__init__.py vllm/v1/structured_output/backend_guidance.py vllm/v1/structured_output/backend_outlines.py vllm/v1/structured_output/backend_types.py vllm/v1/structured_output/backend_xgrammar.py vllm/v1/structured_output/request.py vllm/v1/structured_output/utils.py vllm/v1/worker/__init__.py vllm/v1/worker/block_table.py vllm/v1/worker/cpu_model_runner.py vllm/v1/worker/cpu_worker.py vllm/v1/worker/gpu_input_batch.py vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_worker.py vllm/v1/worker/lora_model_runner_mixin.py vllm/v1/worker/tpu_input_batch.py vllm/v1/worker/tpu_model_runner.py vllm/v1/worker/tpu_worker.py vllm/v1/worker/utils.py vllm/v1/worker/worker_base.py vllm/v1/worker/xpu_model_runner.py vllm/v1/worker/xpu_worker.py vllm/vllm_flash_attn/.gitkeep vllm/worker/__init__.py vllm/worker/cache_engine.py vllm/worker/enc_dec_model_runner.py vllm/worker/model_runner.py vllm/worker/model_runner_base.py vllm/worker/multi_step_model_runner.py vllm/worker/multi_step_neuron_model_runner.py vllm/worker/multi_step_neuronx_distributed_model_runner.py vllm/worker/multi_step_worker.py vllm/worker/neuron_model_runner.py vllm/worker/neuron_worker.py vllm/worker/neuronx_distributed_model_runner.py vllm/worker/pooling_model_runner.py vllm/worker/utils.py vllm/worker/worker.py vllm/worker/worker_base.py