A minimal setup for publishing a Llama / Mistral chat demo on Hugging Face Spaces
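A Space needs only two files: the app.py shown below and a requirements.txt. A minimal requirements.txt might look like the following (the exact package list is an assumption; accelerate is required because the code uses device_map="auto"):

gradio
transformers
torch
accelerate

app.py: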
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Change this to any model you like, e.g. Llama / Mistral
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"  # <- to switch to Llama

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float16,  # recent transformers deprecates torch_dtype in favor of dtype
    device_map="auto",    # requires accelerate
)
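# Note: Llama weights on the Hub are gated. Accept the license on the model page
# first, then pass a Hub token, e.g. from a Space secret (the HF_TOKEN name is
# only an example, not something this demo defines):
#
#   import os
#   tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=os.environ["HF_TOKEN"])
#   model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=os.environ["HF_TOKEN"])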
def chat_fn(history):
    # history is a list of {"role": ..., "content": ...} dicts (Chatbot type="messages");
    # the last entry is the user message just appended by user_send below.
    # Convert the past turns into the prompt format this demo uses.
    prompt = ""
    for i in range(0, len(history) - 1, 2):
        user = history[i]["content"]
        assistant = history[i + 1]["content"]
        prompt += f"<s>[User]: {user}\n[Assistant]: {assistant}</s>\n"
    prompt += f"<s>[User]: {history[-1]['content']}\n[Assistant]:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
    )
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Keep only the last assistant turn
    if "[Assistant]:" in response:
        response = response.split("[Assistant]:")[-1].strip()

    history.append({"role": "assistant", "content": response})
    return history
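# Alternative sketch: instruct models ship a chat template, so the hand-rolled
# prompt above can be swapped for the tokenizer's own template. Since history
# is already a list of {"role", "content"} dicts it can be passed straight in:
#
#   input_ids = tokenizer.apply_chat_template(
#       history, add_generation_prompt=True, return_tensors="pt"
#   ).to(model.device)
#   output_ids = model.generate(input_ids, max_new_tokens=200, do_sample=True)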
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🦙💬 Simple Llama / Mistral Chatbot")
    chatbot = gr.Chatbot(type="messages")  # Gradio deprecates the old tuples format
    msg = gr.Textbox(label="Message")

    def user_send(user_message, chat_history):
        # Clear the textbox and show the user message immediately
        return "", chat_history + [{"role": "user", "content": user_message}]

    # First echo the user message, then run generation on the updated history
    msg.submit(user_send, [msg, chatbot], [msg, chatbot]).then(
        chat_fn, [chatbot], [chatbot]
    )

demo.launch()
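That is the whole app. One optional tweak: if several visitors use the Space at once, their generate() calls will contend for the GPU, so it can help to serialize requests through Gradio's queue (default queue settings assumed):

demo.queue().launch()

After pushing both files, the Space builds the image, downloads the weights, and starts the app. The startup log looks like this: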
===== Application Startup at 2025-11-16 11:06:22 =====
[download progress bars omitted: tokenizer and config files, then three model shards (4.94 GB + 5.00 GB + 4.54 GB) and checkpoint loading]
* Running on local URL: http://0.0.0.0:7860, with SSR ⚡ (experimental, to disable set `ssr_mode=False` in `launch()`)
* To create a public link, set `share=True` in `launch()`.
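The log shows roughly 14.5 GB of fp16 weights across three shards, so this model is a poor fit for the free CPU tier; pick a GPU Space with enough memory. To squeeze onto a smaller GPU, a 4-bit quantized load is one option. A minimal sketch, assuming bitsandbytes is installed and added to requirements.txt:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the weights in 4-bit precision to cut GPU memory to roughly a quarter
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)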