ソフトウェアエンジニアの技術ブログ：Software engineer tech blog – Page 11 – 随机应变 ABCD: Always Be Coding and … : хороший

音声解析をバックエンドからレスポンスする処理

def _lip_sync_amplitudes(audio, fps=30):
    """Return the mean absolute amplitude of each 1/fps-second frame of *audio*.

    Args:
        audio: A pydub ``AudioSegment``.
        fps: Number of lip-sync frames per second of audio.

    Returns:
        A list of floats in the range 0.0-1.0 (samples are normalized by the
        peak amplitude). Empty audio yields an empty list; silent audio
        yields zeros.
    """
    # Work in float64 so np.abs() cannot overflow on the most negative
    # integer sample value.
    samples = np.asarray(audio.get_array_of_samples(), dtype=np.float64)
    if samples.size == 0:
        return []

    # pydub returns interleaved samples; average the channels down to mono.
    if audio.channels > 1:
        samples = samples.reshape((-1, audio.channels)).mean(axis=1)

    # Normalize by the peak, but never divide by zero (all-silent audio),
    # which would put NaN into the JSON response.
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples = samples / peak

    # At least one sample per frame, otherwise range() below gets a zero step.
    samples_per_frame = max(1, int(audio.frame_rate / fps))
    return [
        float(np.abs(samples[start:start + samples_per_frame]).mean())
        for start in range(0, len(samples), samples_per_frame)
    ]


@app.route("/avatar", methods=["POST"])
def avator_response():
    """Answer the posted text with a chat reply, a WAV URL and lip-sync data.

    Expects a JSON object with an optional ``text`` key. Returns JSON with
    ``text`` (the reply), ``audio_url`` (URL served by ``get_audio``) and
    ``lip_sync`` (per-frame amplitudes at 30 fps). A body that is not a JSON
    object is rejected with HTTP 400.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return Response(
            json.dumps({"error": "request body must be a JSON object"}),
            status=400,
            mimetype="application/json",
        )
    user_text = data.get("text", "")

    # 1. Creating a reply using ChatGPT etc. (This is a simple example)
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful avatar assistant."},
            {"role": "user", "content": user_text}
        ]
    )
    reply_text = completion.choices[0].message.content

    # 2. TTS speech generation
    tts = gTTS(text=reply_text, lang='ja')
    # Build both names from one stem instead of str.replace(), which would
    # also rewrite ".mp3" if it appeared in the directory part of the path.
    stem = os.path.join(AUDIO_DIR, f"output_{uuid.uuid4().hex}")
    audio_filename = stem + ".mp3"
    wav_filename = stem + ".wav"

    try:
        tts.save(audio_filename)

        # 3. Convert MP3 to WAV using pydub
        sound = AudioSegment.from_mp3(audio_filename)
        sound.export(wav_filename, format="wav")
    finally:
        # Only the WAV is served to clients; do not leave the MP3 behind.
        if os.path.exists(audio_filename):
            os.remove(audio_filename)

    # 4. Audio data analysis (amplitude calculation for lip-syncing).
    # `sound` already holds the decoded audio, so the WAV is not re-read.
    lip_sync = _lip_sync_amplitudes(sound, fps=30)

    # 5. Return to client
    response_data = {
        "text": reply_text,
        "audio_url": url_for("get_audio", filename=os.path.basename(wav_filename), _external=True),
        "lip_sync": lip_sync
    }

    return Response(json.dumps(response_data, ensure_ascii=False), mimetype="application/json")

@app.route("/audio/<filename>", methods=["GET"])
def get_audio(filename):
    """Serve a generated audio file from AUDIO_DIR as ``audio/wav``.

    ``filename`` comes from the URL, so it is resolved with
    ``send_from_directory``, which refuses paths that escape AUDIO_DIR and
    answers 404 for missing files instead of failing with a server error.
    """
    from flask import send_from_directory

    return send_from_directory(AUDIO_DIR, filename, mimetype="audio/wav")

if __name__ == "__main__":
    # The server listens on all interfaces, so the interactive debugger
    # (which allows code execution from the browser) must not be on by
    # default. Opt in for local development with FLASK_DEBUG=1.
    debug_enabled = os.getenv("FLASK_DEBUG", "").lower() in ("1", "true")
    app.run(host="0.0.0.0", port=5000, debug=debug_enabled)

$ curl -X POST http://127.0.0.1:5000/avatar \
-H "Content-Type: application/json" \
-d '{"text": "おはよう！"}'
{“text”: “おはようございます！お困りのことはありますか？”, “audio_url”: “http://127.0.0.1:5000/audio/output_8b9b617f9ea44b4e9382d99069279c2b.wav”, “lip_sync”: [0.0, 5.401201227152919e-08, 0.008150034567687852, 0.1152054616946809, 0.12006854124357258, 0.08367767791556842, 0.026896253726828846, 0.01888351769433522, 0.07841339713952383, 0.2201013265350214, 0.2508166616255455, 0.2270837834334356, 0.18286134036209653, 0.12693546644773795, 0.19306745020092467, 0.2823540595428423, 0.26987355787927236, 0.30742827204770345, 0.33021129499200624, 0.3036520222097394, 0.13783822322084432, 0.053725370522404184, 0.23884381886531564, 0.26545121635051633, 0.16945415460398394, 0.04699428552910167, 0.037515015339411484, 0.22347993993864235, 0.2646327183165536, 0.22138405781445794, 0.19320739532472023, 0.20940100678390874, 0.23490348053407076, 0.22536436503478374, 0.21555653977444586, 0.14586462429244265, 0.14904603983926024, 0.13877635786198853, 0.08746219159140994, 0.02229572656958908, 0.01466869031672644, 0.010831244868858834, 0.008575973296461132, 0.0059669770556971865, 0.002284438059024327, 0.0007382901957395324, 0.0006873028561552089, 0.00023819297411744372, 0.00011202091345115153, 0.00011088666119344939, 0.00011083264918117787, 0.00014950524996759277, 0.054377403534546086, 0.15464892192023505, 0.11003192109925247, 0.01149159573089055, 0.0017083999481484681, 0.01060147776865575, 0.21648749081795793, 0.2837956941623817, 0.2357313766581688, 0.295791492027827, 0.24570480274813122, 0.2426950913883248, 0.22178412478935314, 0.1279997191375362, 0.1125969515620274, 0.18296455731754743, 0.2677368966858229, 0.30668674113122757, 0.252059802099987, 0.22629239942963317, 0.24090750983018622, 0.0999186038975068, 0.005041211165363177, 0.01174237350386726, 0.16595396016073974, 0.22518860994685216, 0.04122672082271097, 0.002156267553903988, 0.054671390917340024, 0.2686635267683533, 0.24548022080110612, 0.2177663332325109, 0.16169052197208658, 0.25034897161128633, 0.2103575595212375, 0.17521005271572399, 
0.15601337337423843, 0.12766689711791904, 0.1107986756254591, 0.047134932809056736, 0.08557376960636046, 0.11917485848852785, 0.14922184893920407, 0.17545402497515447, 0.15926343818865316, 0.14388913494361147, 0.15382718316553604, 0.08909978179147043, 0.019018493713001773, 0.022057209523398003, 0.019663235103487015, 0.0030874346454651514, 0.0014317504212936955, 0.009182042086159962, 0.0501337337423843, 0.07244177505077129, 0.0849778010629564, 0.06556064468737847, 0.044696560515058555, 0.017215464719353583, 0.0009286285269844013, 0.0002996046320701724, 0.00023268374886574773, 0.00010975240893574729, 0.00011115672125480706, 0.00011115672125480704, 0.00011056258911982022, 0.00011158881735297929, 0.00011050857710754869, 0.00011126474527935009, 0.00011148079332843622, 0.00011083264918117787, 0.00011083264918117786, 7.388843278745191e-05]}

HeyGenのようにアバターが回答する仕組み

テキストベースのチャットbotとは異なり、複数の技術が組み合わさっている。

### アバターが回答する仕組み（HeyGenのようなシステム）
1. 音声合成(Text-to-Speech, TTS)… Google Cloud Text-to-Speech, Amazon Polly
ユーザのテキスト入力を音声に変換する技術

2. 顔の動きや表情の生成
アバターの顔の動きや表情を生成するために、以下の技術が使用される
– 3Dモデリングとアニメーション: アバターの3Dモデルを作成し、表情や動きをアニメーションで表現 Unity, Unreal Engine
– フェイシャルキャプチャ: ユーザの表情をリアルタイムでキャプチャし、それをアバターに反映させる

3. 音声とアニメーションの同期
生成した音声とアニメーションを再生

### gTTSによる簡単な音声合成(Text-to-Speech, TTS)
$ pip3 install gTTS

from gtts import gTTS
import os

# Sentence the avatar should speak.
text = "こんにちは、私はAIアバターです!"

# Synthesize Japanese speech and write it to an MP3 file in one step.
gTTS(text=text, lang='ja').save("output.mp3")
# To play the result on Windows: os.system("start output.mp3")

### 口パク同期処理
$ pip install gTTS pygame pydub numpy
gTTS -> 音声生成
pygame -> 口パクの可視化(簡易アバター表示)
pydub -> 音声の振幅解析
numpy -> 音声データ処理

from gtts import gTTS
from pydub import AudioSegment
import numpy as np
import pygame
import os

# --- 1. Speech synthesis ---
text = "こんにちは、私はAIアバターです！"
tts = gTTS(text=text, lang='ja')
tts.save("output.mp3")

# Convert mp3 -> wav (easier to work with in pydub)
sound = AudioSegment.from_mp3("output.mp3")
sound.export("output.wav", format="wav")

# --- 2. Load the audio data ---
audio = AudioSegment.from_wav("output.wav")
# Use float64 so np.abs() cannot overflow on the most negative integer sample.
samples = np.array(audio.get_array_of_samples(), dtype=np.float64)
# Down-mix to mono (stereo case)
if audio.channels == 2:
    samples = samples.reshape((-1, 2)).mean(axis=1)
# Normalize the amplitude. Skip empty or silent audio: dividing by a zero
# peak would produce NaN and make int() fail when sizing the mouth.
peak = np.max(np.abs(samples)) if samples.size else 0.0
if peak > 0:
    samples = samples / peak

# --- 3. Show the lip-sync animation with pygame ---
pygame.init()
try:
    screen = pygame.display.set_mode((400, 400))
    pygame.display.set_caption("口パクアバター")

    # Start audio playback
    os.system("start output.wav")  # macOS: afplay output.wav / Linux: aplay output.wav

    clock = pygame.time.Clock()
    running = True
    idx = 0
    sample_rate = audio.frame_rate
    fps = 30  # animation frames per second
    # At least 1, otherwise idx would never advance and the loop never ends.
    samples_per_frame = max(1, int(sample_rate / fps))

    while running and idx < len(samples):
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False

        # Mean absolute amplitude of the current frame
        frame = samples[idx:idx + samples_per_frame]
        amplitude = np.abs(frame).mean()

        # Mouth height grows with the amplitude
        mouth_height = int(50 + amplitude * 200)

        # Background
        screen.fill((255, 255, 255))
        # Face (circle)
        pygame.draw.circle(screen, (255, 224, 189), (200, 200), 100)
        # Mouth (rectangle)
        pygame.draw.rect(screen, (150, 0, 0), (150, 250, 100, mouth_height))

        pygame.display.flip()

        idx += samples_per_frame
        clock.tick(fps)
finally:
    # Always release the window, even if drawing or event handling fails.
    pygame.quit()

### フロント(X-code, AndroidStudio)とバックエンド(Python)の切り分け
##### バックエンドで担当する処理
1. ユーザー入力の受け取り
アプリから送られてくるテキストメッセージを受信
例: /chat エンドポイントで JSON 受け取り

2. AI応答生成
ChatGPT APIなどを呼び出して返信テキストを生成
例: “こんにちは！今日はどんなことを話しましょうか？”

3. 音声生成（Text-to-Speech, TTS）
生成したテキストを音声データに変換
gTTS, OpenAI TTS, Coqui TTS など
出力形式は MP3/WAV など

4. 音声解析（口パク用振幅解析）
音声データを読み込んでフレームごとの振幅を算出
numpy や pydub で RMS / 平均振幅を計算
口パクアニメーションの高さや動きの指標として返す

5. バックエンドからクライアントへの送信

{
  "text": "こんにちは！",
  "audio_url": "https://server/output.wav",
  "lip_sync": [0.1, 0.2, 0.3, ...]  # フレームごとの振幅データ
}

##### クライアント（Android / iOS）で担当する処理
1. ユーザー入力の送信
テキストをバックエンドに送る
HTTP POST / WebSocket

2. 受信したデータの処理
テキスト表示
音声データの再生
Android: MediaPlayer / ExoPlayer
iOS: AVAudioPlayer
口パクデータの再生（振幅に応じてアバターの口を動かす）

3. 口パクアニメーション
受信した lip_sync 配列をフレーム単位で参照
Unity / SceneKit / SpriteKit などでアバターの口の高さや形を変化させる

バックエンドは「音声と口パクデータの生成」まで
実際の描画や音声再生はアプリ側で行う

なるほど、この仕組みは凄い！

chatバックエンド側(Python (Flask + PostgreSQL + OpenAI))の作り込み1

chat() の中で　
認証チェックを追加　
DBにユーザーの入力を保存　
OpenAI API や HuggingFace API を呼んで応答を生成　
生成した応答をDBに保存して返却

### step.0 テーブル作成

-- One row per chat exchange: the user's message plus the generated reply.
CREATE TABLE chat_messages (
    -- Auto-incrementing surrogate key.
    id SERIAL PRIMARY KEY,
    -- Identifier of the user who sent the message (required).
    user_id VARCHAR(50) NOT NULL,
    -- Text sent by the user (required).
    message TEXT NOT NULL,
    -- Generated reply; nullable, so a row can exist before the reply is stored.
    reply TEXT,
    -- Row creation time, filled in by the database when not supplied.
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

### step.1 パッケージインストール
$ pip install flask psycopg2-binary openai

### step.2 Python (Flask + PostgreSQL + OpenAI)

from flask import Flask, request, Response
import json
import psycopg2
import os
from openai import OpenAI

# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)

# Keyword arguments for psycopg2.connect(), taken from environment variables.
# Only host and port have fallbacks (localhost / 5432).
DB_CONFIG = dict(
    dbname=os.getenv("DB_NAME"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    host=os.getenv("DB_HOST", "localhost"),
    port=os.getenv("DB_PORT", 5432),
)


def get_db_connection():
    """Open a PostgreSQL connection using DB_CONFIG.

    Returns:
        The psycopg2 connection, or ``None`` when connecting raised any
        exception (the error is printed, not re-raised).
    """
    try:
        connection = psycopg2.connect(**DB_CONFIG)
    except Exception as exc:
        print(f"Database connection error: {exc}")
        connection = None
    return connection

def _json_response(payload, status=200):
    """Build a UTF-8 JSON Response from *payload* with the given HTTP status."""
    return Response(
        json.dumps(payload, ensure_ascii=False),
        status=status,
        content_type="application/json; charset=utf-8",
    )


@app.route("/chat", methods=["POST"])
def chat():
    """Store the user's message, generate a reply, store it and return it.

    Expects a JSON object with ``user_id`` and ``message``.

    Returns:
        200 with ``{"reply": ...}`` on success;
        400 when the body is not a JSON object or a required field is missing;
        503 when no database connection could be opened;
        500 when the database or the model call fails.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return _json_response({"error": "request body must be a JSON object"}, status=400)

    user_id = data.get("user_id")
    message = data.get("message")

    if not user_id or not message:
        return _json_response({"error": "user_id and message are required"}, status=400)

    # get_db_connection() signals failure by returning None.
    conn = get_db_connection()
    if conn is None:
        return _json_response({"error": "database connection failed"}, status=503)

    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "INSERT INTO chat_messages (user_id, message) VALUES (%s, %s) RETURNING id;",
                (user_id, message)
            )
            chat_id = cur.fetchone()[0]
            # Commit now so the user's message is kept even if the model call
            # fails, and no transaction stays open during the network call.
            conn.commit()

            client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": message}
                ]
            )
            api_reply = completion.choices[0].message.content

            cur.execute(
                "UPDATE chat_messages SET reply = %s WHERE id = %s;",
                (api_reply, chat_id)
            )
            conn.commit()
        finally:
            cur.close()
    except Exception as e:
        # Failures past input validation are server-side, so report 500.
        # NOTE(review): str(e) may expose internal details to the client.
        return _json_response({"error": str(e)}, status=500)
    finally:
        # Always release the connection, on success and on failure.
        conn.close()

    return _json_response({"reply": api_reply})


if __name__ == "__main__":
    # Start Flask's built-in server with debug mode on; no host or port is
    # passed, so app.run() uses its defaults.
    # NOTE(review): debug mode is presumably meant for local development only.
    app.run(debug=True)

$ curl -X POST http://127.0.0.1:5000/chat -H "Content-Type: application/json" -d '{"user_id": "12345", "message": "おはよう！"}'
{"reply": "おはようございます！元気ですか？何かお手伝いできることがありますか？"}

おおお

flaskでバックエンドを構築する

フロントエンドからデータを受け取るところを記述する
jsonをpostすると、jsonのレスポンスが返ってくる

from flask import Flask, request, Response
import json

# Flask application object; the route decorator below registers a handler on it.
app = Flask(__name__)

@app.route("/chat", methods=["POST"])
def chat():
    """Echo the posted ``user_id`` and ``message`` back as a dummy reply.

    Returns JSON ``{"reply": ...}`` on success. Any exception raised while
    handling the request is reported as ``{"error": ...}`` with HTTP 400.
    """
    json_utf8 = "application/json; charset=utf-8"
    try:
        payload = request.get_json()
        sender = payload.get("user_id")
        body = payload.get("message")

        # Placeholder: authentication, DB access and the AI model call belong
        # here. For now a canned reply is returned.
        reply_body = json.dumps(
            {"reply": f"ユーザ({sender})のメッセージを受け取りました: {body}"},
            ensure_ascii=False,
        )
        return Response(reply_body, content_type=json_utf8)
    except Exception as exc:
        return Response(
            json.dumps({"error": str(exc)}, ensure_ascii=False),
            status=400,
            content_type=json_utf8,
        )


if __name__ == "__main__":
    # Start Flask's built-in server with debug mode on; no host or port is
    # passed, so app.run() uses its defaults.
    # NOTE(review): debug mode is presumably meant for local development only.
    app.run(debug=True)

$ curl -X POST http://127.0.0.1:5000/chat -H "Content-Type: application/json" -d '{"user_id": "12345", "message": "おはよう！"}'
{"reply": "ユーザ(12345)のメッセージを受け取りました: おはよう！"}

この chat() の中で以下も書いていく。
– 認証チェックを追加　
– DBにユーザーの入力を保存　
– OpenAI API や HuggingFace API を呼んで応答を生成　
– 生成した応答をDBに保存して返却

ChatGPT-4oとGPT5の違い

ChatGPT-4oがマルチモーダルAI実用化であったのに対し、ChatGPT-5は「思考するAI」「動けるAI」へ進化

#### ChatGPT-4o
– モデル構造 : テキスト・音声・画像を統合した「マルチモーダルモデル」
– モデルの選択 :　手動選択(GPT-4o, o3など)
– 応答速度(音声) : 平均320ms, 自然な会話速度
– 記憶機能(メモリ) : 一部記憶あり
– 外部連携機能 :　なし

#### ChatGPT-5
– モデル構造 : 複数の思考エンジンを自動で切り替える「統合システム+動的推論」
– モデルの選択 : 自動判定(Base/Thinking/Proにルーティング)
– 応答速度(音声) :　音声は4oベース、今後統合予定
– 記憶機能(メモリ) : 永続メモリ。過去の会話やユーザの好みを保持
– 外部連携機能 : Gmail、カレンダー、Drive等と直接連携可能

プロンプトの内容や難易度を自動解析し、最適な推論モデル(高速応答/深層思考)を選択
AIの性格を選べる

ソフトウェア開発、数学、多言語コード編集、マルチモーダル理解などあらゆる指標で賢くなった
プランはFree, Plus, Pro, Enterpriseなどあり

思考の質が自動最適化され、過去を覚える

Pythonで入力中の続きのコード生成AIをAPIで試す

フィボナッチ関数について、途中まで書いていて、その続きのコードをレスポンスとして返します。
ソースコードをAPIに渡し、"complete this code"と指示を出しています。

from openai import OpenAI

import os

# Read the API key from the environment rather than hard-coding it in source.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Unfinished function that the model is asked to complete.
partial_code = """
def fibonacci(n):
    \"\"\"n番目までのフィボナッチ数列を返す関数.\"\"\"
    sequence = [0, 1]
"""

# The system prompt asks for code only; the low temperature is passed to
# reduce variation in the completion.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are an expert Python programmer. Complete the given code without explanations, only code."},
        {"role": "user", "content": f"Complete this Python code:\n\n{partial_code}"}
    ],
    temperature=0.2,
)

# Print the text of the first returned choice.
print(response.choices[0].message.content)

$ python3 codegen_input.py
```python
    if n <= 0:
        return []
    elif n == 1:
        return [0]
    elif n == 2:
        return sequence

    for i in range(2, n):
        next_value = sequence[-1] + sequence[-2]
        sequence.append(next_value)

    return sequence
```

GitHub Copilotでは、コメントからのコード生成結果がグレーアウトされたコードとして即時表示される。これは特定の実行ボタンなどを押下しなくても表示されるので、Ajaxなど非同期で実行していると思われる。入力中の続きのコード生成も同様に、コメントを書き終えた時や、コードを打ち終えた時などにトリガーが発動される。

Pythonでコード型の生成AIをAPIで試す

from openai import OpenAI

import os

# Read the API key from the environment rather than hard-coding it in source.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Natural-language request, written as a code comment, for the model to implement.
prompt = """
# Pythonで、与えられた文字列を逆順にして返す関数を作ってください
"""

# Send the request as a chat completion: a system role message plus the
# user prompt, with a low temperature.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant that writes clean Python code.."},
        {"role": "user", "content": prompt}
    ],
    temperature=0.2,
)

# Print the text of the first returned choice.
print(response.choices[0].message.content)

$ python3 codegen.py
もちろんです！以下は、与えられた文字列を逆順にして返すPythonの関数です。

```python
def reverse_string(s):
    return s[::-1]

# 使用例
input_string = "こんにちは"
reversed_string = reverse_string(input_string)
print(reversed_string) # 出力: はこんに
```

この関数 `reverse_string` は、スライスを使って文字列を逆順にしています。`s[::-1]` は、文字列 `s` の全ての文字を逆順に取得します。

model, messages, temperatureを指定してリクエストしていることがわかります。
なるほど

【Cisco Packet Tracer】ルーターを使ったVLAN間ルーティング

ルーターを使ったVLAN間ルーティング（Router-on-a-Stick構成）は、VLANで分離されたネットワーク同士を通信可能にする方法

switchの設定

Switch> enable
Switch# configure terminal

! VLANの作成
Switch(config)# vlan 10
Switch(config-vlan)# name VLAN10
Switch(config-vlan)# exit

Switch(config)# vlan 20
Switch(config-vlan)# name VLAN20
Switch(config-vlan)# exit

! ポートにVLANを割り当て
Switch(config)# interface range fa0/1 - 2
Switch(config-if-range)# switchport mode access
Switch(config-if-range)# switchport access vlan 10
Switch(config-if-range)# exit

Switch(config)# interface range fa0/3 - 4
Switch(config-if-range)# switchport mode access
Switch(config-if-range)# switchport access vlan 20
Switch(config-if-range)# exit

! ルーターに接続するポートをトランクに設定
Switch(config)# interface fa0/5
Switch(config-if)# switchport mode trunk
Switch(config-if)# exit

ルータの設定

Router> enable
Router# configure terminal

! サブインターフェース作成 VLAN10
Router(config)# interface g0/0.10
Router(config-subif)# encapsulation dot1Q 10
Router(config-subif)# ip address 192.168.10.1 255.255.255.0
Router(config-subif)# exit

! サブインターフェース作成 VLAN20
Router(config)# interface g0/0.20
Router(config-subif)# encapsulation dot1Q 20
Router(config-subif)# ip address 192.168.20.1 255.255.255.0
Router(config-subif)# exit

! 物理インターフェースを有効化
Router(config)# interface g0/0
Router(config-if)# no shutdown
Router(config-if)# exit

pc0からVLANで分離されたpc2に接続できるようになる。
必要な通信は許可するというような時に活用する。

つまり、switch単体ではVLANを分離できるが、そのVLANを接続させるなどの制御にはルーターが必要

【Cisco Packet Tracer】VLANの設定

VLAN（Virtual LAN、仮想LAN）とは、物理的には同じネットワークに接続されている機器同士を、論理的に分割して異なるネットワークのように扱う技術です。

通常、同じスイッチ（L2スイッチ）に接続されたPCやサーバーは、すべて同じネットワーク（ブロードキャストドメイン）に属します。しかし、VLANを使うことで、同じスイッチ内でも異なるVLANに分けることができ、それぞれのVLANは独立したネットワークとして動作します。

Switch> enable
Switch# configure terminal

! VLANを作成
Switch(config)# vlan 10
Switch(config-vlan)# name VLAN10
Switch(config-vlan)# exit

Switch(config)# vlan 20
Switch(config-vlan)# name VLAN20
Switch(config-vlan)# exit

! ポートをVLANに割り当て
Switch(config)# interface range fa0/1 - 2
Switch(config-if-range)# switchport mode access
Switch(config-if-range)# switchport access vlan 10
Switch(config-if-range)# exit

Switch(config)# interface range fa0/3 - 4
Switch(config-if-range)# switchport mode access
Switch(config-if-range)# switchport access vlan 20
Switch(config-if-range)# exit

Switch(config)# exit
Switch#

command promptからpingでpc2, pc3に疎通確認

なるほど、スイッチでネットワークを分離するのね。

【Azure】BlobStorageへPythonでアップロードする

$ pip install azure-storage-blob

Azureのストレージアカウントのセキュリティ&ネットワークのアクセスキーから接続文字列を取得する(キーの方ではない)

from azure.storage.blob import BlobServiceClient, ContentSettings

import os

# Take the storage account connection string from the environment instead of
# embedding the secret in source. A missing variable fails here with KeyError.
connection_string = os.environ["AZURE_STORAGE_CONNECTION_STRING"]

blob_service_client = BlobServiceClient.from_connection_string(connection_string)

container_name = "images"
blob_name = "cat.jpg"
local_file_path = "cat.jpg"

container_client = blob_service_client.get_container_client(container_name)

# Upload the local file, replacing any existing blob of the same name and
# setting its content type to image/jpeg.
with open(local_file_path, "rb") as data:
    container_client.upload_blob(
        name=blob_name,
        data=data,
        overwrite=True,
        content_settings=ContentSettings(content_type="image/jpeg")
    )

print(f"Blob '{blob_name}' uploaded to container '{container_name}' successfully.")

おおお、アップロードできるようになると、一段別のステージに行った感がありますね。
次はAzureSQLにも触っておきたい。