[LLM] Uploading to a Hugging Face Space

A minimal setup for publishing a Llama / Mistral chat demo on Hugging Face Spaces.
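On the Spaces side, a Gradio Space needs little more than app.py and requirements.txt in the repo root, plus the YAML metadata block that Spaces reads from the top of README.md. A minimal sketch of the layout (the title and sdk_version are example values, not requirements):

README.md          # starts with the Space metadata block:
                   #   ---
                   #   title: Llama Mistral Chat
                   #   sdk: gradio
                   #   sdk_version: 4.44.0
                   #   app_file: app.py
                   #   ---
app.py             # the Gradio app shown below
requirements.txt   # gradio / transformers / torch / accelerate
                   # (accelerate is what device_map="auto" relies on)

app.py itself: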

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Swap in any model you like here (Llama / Mistral, etc.)
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"  # e.g. to switch to Llama
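# Note: the meta-llama checkpoints are gated. Accept the license on the Hub,
# add an access token as a secret in the Space settings (secrets are exposed
# as environment variables; HF_TOKEN is the usual name, not a fixed one), and
# pass token=os.environ.get("HF_TOKEN") to both from_pretrained() calls
# (requires import os).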

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)
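# device_map="auto" requires the accelerate package. A 7B model in float16
# needs roughly 14 GB of VRAM, so assign GPU hardware (e.g. an A10G) in the
# Space settings; the free CPU tier is impractical for a model this size.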

def chat_fn(history):
    # The last history entry holds the new user message; its assistant slot
    # is still None (see user_send below)
    message = history[-1][0]

    # Convert the earlier turns into the LLM's prompt format
    prompt = ""
    for user, assistant in history[:-1]:
        prompt += f"<s>[User]: {user}\n[Assistant]: {assistant}</s>\n"
    prompt += f"<s>[User]: {message}\n[Assistant]:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output_ids = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        do_sample=True,
        top_p=0.9
    )

    # Decode only the newly generated tokens, not the echoed prompt
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Fill in the assistant slot of the pending turn
    history[-1][1] = response
    return history


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🦙💬 Simple Llama / Mistral Chatbot")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Message")

    # Two-step submit: first append the user message to the history (with the
    # assistant slot left as None) and clear the textbox, then let chat_fn
    # fill in the reply
    def user_send(user_message, chat_history):
        return "", chat_history + [[user_message, None]]

    msg.submit(user_send, [msg, chatbot], [msg, chatbot]).then(
        chat_fn, chatbot, chatbot
    )

demo.launch()
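With app.py ready, upload the files to the Space. You can git push to the Space repo directly, or use huggingface_hub from Python. A minimal sketch with the Python client, assuming the Space already exists and you have run huggingface-cli login (the repo id your-username/llama-mistral-chat is a placeholder):

from huggingface_hub import HfApi

api = HfApi()
for filename in ["app.py", "requirements.txt"]:
    api.upload_file(
        path_or_fileobj=filename,                    # local file to send
        path_in_repo=filename,                       # destination inside the repo
        repo_id="your-username/llama-mistral-chat",  # placeholder Space id
        repo_type="space",                           # target a Space, not a model repo
    )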

Once the upload finishes, the Space rebuilds and starts the app. The runtime logs confirm the startup and show the model files being pulled from the Hub:

===== Application Startup at 2025-11-16 11:06:22 =====

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]
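As an aside, instruct-tuned checkpoints like Mistral-7B-Instruct-v0.2 ship their own chat template, so the hand-rolled [User]/[Assistant] prompt above can be replaced by the tokenizer's native format. A sketch of how chat_fn's prompt construction could look instead (same history structure as above; generation parameters unchanged):

# Let the tokenizer render the history in the model's own chat format
messages = []
for user, assistant in history[:-1]:
    messages.append({"role": "user", "content": user})
    messages.append({"role": "assistant", "content": assistant})
messages.append({"role": "user", "content": history[-1][0]})

input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=200,
                            do_sample=True, temperature=0.7, top_p=0.9)
response = tokenizer.decode(output_ids[0][input_ids.shape[1]:],
                            skip_special_tokens=True).strip()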