<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
  xmlns:xhtml="http://www.w3.org/1999/xhtml">
  <url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/paper-index/</loc>
    <lastmod>2026-02-10T11:44:31+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/paper-index/</loc>
    <lastmod>2025-08-03T16:59:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/distributed-systems/paper-index/</loc>
    <lastmod>2025-08-03T16:59:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/research/introcution-to-ggml-llama-cpp/ggml-with-c&#43;&#43;/</loc>
    <lastmod>2026-03-18T14:58:15+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/learning/llm-related-math/matrix-math/</loc>
    <lastmod>2026-03-20T11:08:34+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/learning/llm-related-math/probalility-and-statistics/</loc>
    <lastmod>2026-03-20T11:08:34+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/dev/cli-notes/</loc>
    <lastmod>2026-04-03T17:02:57+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/research/ml-architectures/linear-attention/</loc>
    <lastmod>2026-03-18T14:58:15+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/research/ml-architectures/lstm/</loc>
    <lastmod>2026-03-18T14:58:15+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/research/ml-architectures/mamba/</loc>
    <lastmod>2026-03-18T14:58:15+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/research/ml-architectures/rnn/</loc>
    <lastmod>2026-03-18T14:58:15+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/learning/triton/0-notes-and-paradigm/</loc>
    <lastmod>2026-03-20T11:08:34+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/learning/torch-python/</loc>
    <lastmod>2026-03-20T11:08:34+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/server-management/</loc>
    <lastmod>2026-01-08T19:48:56+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/learning/triton/1-builtin-tutorial/</loc>
    <lastmod>2026-03-20T11:08:34+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/software-router/</loc>
    <lastmod>2026-01-08T19:48:56+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/dev/sync-dot-files/</loc>
    <lastmod>2025-10-04T16:45:46+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/dev/clang-on-apple/</loc>
    <lastmod>2025-09-12T20:09:57+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/dev/git-essentials/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/dev/vscode-essentials/</loc>
    <lastmod>2025-08-10T15:29:26+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/learning/deep-learning-basic/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/learning/numpy-notes/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/learning/build-a-large-language-model-from-scratch/build-a-large-language-model-from-scratch-reading-note/</loc>
    <lastmod>2025-08-18T21:17:03+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/useful-websites/</loc>
    <lastmod>2025-09-12T20:09:57+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/go-mistakes/</loc>
    <lastmod>2025-08-03T16:59:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/macos-shortcuts/</loc>
    <lastmod>2025-08-03T16:59:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/dev-operations/</loc>
    <lastmod>2025-08-10T15:29:26+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/the-usage-of-jj/</loc>
    <lastmod>2025-08-10T16:08:20+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/chrome-plugin/</loc>
    <lastmod>2025-08-04T20:58:51+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/tips-and-guidance-for-writing/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/pass-the-wall/</loc>
    <lastmod>2026-01-08T19:48:56+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/docker-network/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/learning/using-rust-note/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/100-go-mistakes/chapter89/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/learning/100-exercises-to-learn-rust/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/100-go-mistakes/chapter7/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/100-go-mistakes/chapter6/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/100-go-mistakes/chapter5/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/100-go-mistakes/chapter2-code-and-project-organization/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/100-go-mistakes/chatpter3-data-types/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/100-go-mistakes/chapter4-control-structures/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/golang-errors/</loc>
    <lastmod>2025-08-03T16:59:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/golang-struct-and-interface/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/rustlings/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/aerospace/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/windows-mouse-action-on-unix/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/macos/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/pieces/a-piece-of-golang-profile/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/missing-semester-lecture-5/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/missing-semester-lecture-4/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/missing-semester-lecture-2/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/missing-semester-lecture-1/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/watchdog-in-openfaas/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/storage/log/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/talks/transaction-anomalies/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/learning/programming-rust-booknote/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/pieces/a-piece-of-go-test/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/talks/talk-about-postgres-visibility-check-rules/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/pieces/a-piece-of-threadlocal/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/talks/talk-about-wsl2-network-mode-problems/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/design-data-intensive-applications/chapter-9-consistency-and-consensus/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/talks/6.824-lab2b-research/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/talks/talk-about-consistency-and-consensus/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/talks/about-cap/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/design-data-intensive-applications/chapter-8-the-trouble-with-distributed-systems/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/talks/talk-about-redlock/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/design-data-intensive-applications/chapter-7-transactions/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/design-data-intensive-applications/chapter-6-partioning/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/design-data-intensive-applications/chapter-5-replication/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/design-data-intensive-applications/chapter-4-encoding-and-evolution/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/design-data-intensive-applications/chapter-2-data-models-and-query-languages/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/books/design-data-intensive-applications/chapter-3-storage-and-retrieval/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/</loc>
    <lastmod>2026-04-03T17:02:57+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/</loc>
    <lastmod>2026-04-03T17:02:57+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/</loc>
    <lastmod>2026-04-03T17:02:57+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/tobetagged/</loc>
    <lastmod>2026-04-03T17:02:57+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/cascade-speculative-drafting-for-even-faster-llm-inference/</loc>
    <lastmod>2026-02-10T11:44:31+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/neurips-24/</loc>
    <lastmod>2026-02-10T11:44:31+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/speculative-decoding/</loc>
    <lastmod>2026-02-10T11:44:31+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/3-model-speculative-decoding/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/acl-24/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/layerskip-enabling-early-exit-inference-and-self-speculative-decoding/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/neurips-25-workshop/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/draft--verify-lossless-large-language-model-acceleration-via-self-speculative-decoding/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/iclr-25/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/swift-on-the-fly-self-speculative-decoding-for-llm-inference-acceleration/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/aclfindings-25/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/cas-spec-cascade-adaptive-self-speculative-decoding-for-on-the-fly-lossless-inference-acceleration-of-llms/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/hierarchical-speculative-decoding-with-dynamic-windows-for-efficient-language-model-inference/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/neurips-25/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/aiconfigurator-lightning-fast-configuration-optimization-for-multi-framework-llm-serving/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/arxiv-2601/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/auto-tuning/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/llm-serving/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/revati-transparent-gpu-free-time-warp-emulation-for-llm-serving/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/arxiv-25/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/arxiv-2507/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/estimating-llm-uncertainty-with-evidence/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/r-stitch-dynamic-trajectory-stitching-for-efficient-reasoning/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/flexprefill-a-context-aware-sparse-attention-mechanism-for-efficient-long-sequence-inference/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/icml-25/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/sparse-attention/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/xattention-block-sparse-attention-with-antidiagonal-scoring/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/dev/</loc>
    <lastmod>2026-01-08T19:48:56+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/arxiv-2502/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/beyond-the-80-20-rule-high-entropy-minority-tokens-drive-effective-reinforcement-learning-for-llm-reasoning/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/entropy-adaptive-decoding-dynamic-model-switching-for-efficient-inference/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/icml-24-workshop/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/think-big-generate-quick-llm-to-slm-for-fast-autoregressive-decoding/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/sec-25/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/sled-a-speculative-llm-decoding-framework-for-efficient-edge-serving/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/atc-25/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/kvcache/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/kvcache-cache-in-the-wild-characterizing-and-optimizing-kvcache-cache-at-a-large-cloud-provider/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/arxiv-2504/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/token-level-routing-inference-system-for-edge-devices/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/ktransformers-unleashing-the-full-potential-of-cpu-gpu-hybrid-inference-for-moe-models/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/llm-inference/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/sosp-25/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/icml-23/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/mlsys-25/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/qserve-w4a8kv4-quantization-and-system-co-design-for-efficient-llm-serving/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/quantization/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/smoothquant-accurate-and-efficient-post-training-quantization-for-large-language-models/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/lserve-efficient-long-sequence-llm-serving-with-unified-sparse-attention/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/duoattention-efficient-long-context-llm-inference-with-retrieval-and-streaming-heads/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/efficient-streaming-language-models-with-attention-sinks/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/iclr-24/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/icml-24/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/long-context/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/quest-query-aware-sparsity-for-efficient-long-context-llm-inference/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/a-dynamic-parallel-method-for-performance-optimization-on-hybrid-cpus/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/arxiv-24/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/dynamic-sparse-attention-on-mobile-socs/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/mobile/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/eagle-speculative-sampling-requires-rethinking-feature-uncertainty/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/distserve-disaggregating-prefill-and-decoding-for-goodput-optimized-large-language-model-serving/</loc>
    <lastmod>2025-10-04T16:45:46+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/isca24/</loc>
    <lastmod>2025-10-04T16:45:46+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/osdi-24/</loc>
    <lastmod>2025-10-04T16:45:46+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/splitwise-efficient-generative-llm-inference-using-phase-splitting/</loc>
    <lastmod>2025-10-04T16:45:46+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/taming-throughput-latency-tradeoff-in-llm-inference-with-sarathi-serve/</loc>
    <lastmod>2025-10-04T16:45:46+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/edge-devices/</loc>
    <lastmod>2025-09-02T09:27:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/scaling-up-on-device-llms-via-active-weight-swapping-between-dram-and-flash/</loc>
    <lastmod>2025-09-02T09:27:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/sparsity/</loc>
    <lastmod>2025-09-02T09:27:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/elms-elasticized-large-language-models-on-mobile-devices/</loc>
    <lastmod>2025-09-02T09:27:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/asplos-23/</loc>
    <lastmod>2025-08-26T22:31:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/llm/</loc>
    <lastmod>2025-08-26T09:38:39+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/smallthinker-a-family-of-efficient-large-language-models-natively-trained-for-local-deployment/</loc>
    <lastmod>2025-08-26T09:38:39+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/sti-turbocharge-nlp-inference-at-the-edge-via-elastic-pipelining/</loc>
    <lastmod>2025-08-26T22:31:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/edgemoe-empowering-sparse-large-language-models-on-mobile-devices/</loc>
    <lastmod>2025-10-04T16:45:46+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/gpu/</loc>
    <lastmod>2025-08-26T09:38:39+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/heterollm-accelerating-large-language-model-inference-on-mobile-socs-with-heterogeneous-ai-accelerators/</loc>
    <lastmod>2025-08-26T09:38:39+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/ieee-tmc-25/</loc>
    <lastmod>2025-10-04T16:45:46+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/moe/</loc>
    <lastmod>2025-10-04T16:45:46+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/npu/</loc>
    <lastmod>2025-08-26T09:38:39+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/a-survey-of-resource-efficient-llm-and-multimodal-foundation-models/</loc>
    <lastmod>2025-08-26T22:31:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/h2o-heavy-hitter-oracle-for-efficient-generative-inference-of-large-language-models/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/nips-23/</loc>
    <lastmod>2026-02-02T22:58:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/survey/</loc>
    <lastmod>2025-08-26T22:31:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/llm-as-a-system-service-on-mobile-devices/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/arxiv-23/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/kv-runahead-scalable-causal-llm-inference-by-parallel-key-value-cache-generation/</loc>
    <lastmod>2025-11-10T23:07:02+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/ring-attention-with-blockwise-transformers-for-near-infinite-context/</loc>
    <lastmod>2025-08-25T23:06:01+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/sequence-parallelism/</loc>
    <lastmod>2025-08-25T23:06:01+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/striped-attention-faster-ring-attention-for-causal-transformers/</loc>
    <lastmod>2025-10-04T16:45:46+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/tensor-parallelism/</loc>
    <lastmod>2025-11-10T23:07:02+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/tpi-llm-serving-70b-scale-llms-efficiently-on-low-resource-mobile-devices/</loc>
    <lastmod>2025-08-26T22:31:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/tsc-24/</loc>
    <lastmod>2025-08-26T22:31:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/llm.int8-8-bit-matrix-multiplication-for-transformers-at-scale/</loc>
    <lastmod>2025-08-25T23:06:01+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/nips2022/</loc>
    <lastmod>2025-08-25T23:06:01+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/misc/llm-generated-content/</loc>
    <lastmod>2025-08-12T18:23:35+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/misc./</loc>
    <lastmod>2025-08-12T18:23:35+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/deja-vu-contextual-sparsity-for-efficient-llms-at-inference-time/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/fast-on-device-llm-inference-with-npus/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/llm-preliminaries/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/theoretical/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/commodity-gpu/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/llm-in-a-flash-efficient-large-language-model-inference-with-limited-memory/</loc>
    <lastmod>2026-02-09T23:45:30+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/powerinfer-2-fast-large-language-model-inference-on-a-smartphone/</loc>
    <lastmod>2025-08-19T09:02:08+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/serving-on-edge/</loc>
    <lastmod>2025-08-25T23:06:01+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/awq-activation-aware-weight-quantization-for-llm-compression-and-acceleration/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/powerinfer-fast-large-language-model-serving-with-a-consumer-grade-gpu/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/sosp-24/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/arxiv-21/</loc>
    <lastmod>2025-08-19T09:02:08+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/asplos-24/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/flexgen-high-throughput-generative-inference-of-large-language-models-with-a-single-gpu/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/llm-tuning/</loc>
    <lastmod>2025-08-19T09:02:08+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/lora-low-rank-adaptation-of-large-language-models/</loc>
    <lastmod>2025-08-19T09:02:08+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/specinfer-accelerating-large-language-model-serving-with-tree-based-speculative-inference-and-verification/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/edgellm-fast-on-device-llm-inference-with-speculative-decoding/</loc>
    <lastmod>2025-08-25T23:06:01+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/efficient-memory-management-for-large-language-model-serving-with-pagedattention/</loc>
    <lastmod>2025-08-19T09:02:08+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/memory-management/</loc>
    <lastmod>2025-08-19T09:02:08+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/a-survey-on-efficient-inference-for-large-language-models/</loc>
    <lastmod>2025-08-25T23:06:01+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/distributed-transactions/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/sonata-multi-database-transactions-made-fast-and-serializable/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/vldb-25/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/concurrency-control-as-a-service/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/orca-a-distributed-serving-system-for-transformer-based-generative-models/</loc>
    <lastmod>2025-08-19T09:02:08+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/osdi-22/</loc>
    <lastmod>2025-08-19T09:02:08+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/schedule/</loc>
    <lastmod>2025-08-19T09:02:08+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/edgeshard-efficient-llm-inference-via-collaborative-edge-computing/</loc>
    <lastmod>2025-09-02T09:27:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/ieee-iot-j-25/</loc>
    <lastmod>2025-09-02T09:27:44+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/serverless/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/llm/serverlessllm-locality-enhanced-serverless-inference-for-large-language-models/</loc>
    <lastmod>2025-09-01T22:40:37+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/reading-note/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/programming/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/rust/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/100-mistakes-in-golang/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/learning/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/tools/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/tutorial/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/about/</loc>
    <lastmod>2025-07-28T21:01:24+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/golang/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/course/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/the-missing-semester/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/faas/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/data-management-in-microservices/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/paper-note/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/talk/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/taking-omid-to-the-clouds/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/omid-reloaded/</loc>
    <lastmod>2025-11-17T22:21:48+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/programming-rust/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/towards-transaction-as-a-service/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/scalable-distributed-transactions-across-heterogeneous-stores/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/grit/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/how-to-read-a-paper/</loc>
    <lastmod>2025-04-05T15:44:39+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/ad-hoc-transactions-in-web-applications/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/concurrency/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/java/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/cobra/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/transactions/epoxy/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/distributed-systems/zab/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/design-data-intensive-applications/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/distributed-systems/cap-twelve-years-later/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/6.824/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/tags/distributed/</loc>
    <lastmod>2025-08-01T17:10:36+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/distributed-systems/chain-replication/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/distributed-systems/time-clocks-and-the-ordering/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/distributed-systems/chubby/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/distributed-systems/bigtable/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/distributed-systems/mapreduce/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/posts/papers/distributed-systems/zookeeper/</loc>
    <lastmod>2025-08-01T16:47:32+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/archives/</loc>
    <lastmod>2024-11-28T00:32:21+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/categories/</loc>
  </url><url>
    <loc>http://blog.kkkzoz.top/search/</loc>
    <lastmod>2024-11-28T00:32:21+08:00</lastmod>
  </url><url>
    <loc>http://blog.kkkzoz.top/series/</loc>
  </url>
</urlset>
