バックエンドで音声を解析し、その結果をレスポンスとして返す処理

def _compute_lip_sync(samples, channels, frame_rate, fps=30):
    """Return one normalised mean-absolute-amplitude value per video frame.

    Args:
        samples: Interleaved PCM samples (any sequence NumPy can convert).
        channels: Number of interleaved channels in ``samples``.
        frame_rate: Audio sample rate in Hz.
        fps: Number of lip-sync values to produce per second of audio.

    Returns:
        A list of floats in the range 0..1; empty when there are no samples.
    """
    # Convert to float first: np.abs() on int16 overflows for -32768, which
    # would corrupt the peak used for normalisation.
    samples = np.asarray(samples, dtype=np.float64)
    if channels > 1:
        # Down-mix interleaved channels to mono by averaging each frame.
        samples = samples.reshape((-1, channels)).mean(axis=1)
    if samples.size == 0:
        return []

    peak = np.abs(samples).max()
    if peak > 0:
        # Skip the division for pure silence; 0/0 would yield NaN, which is
        # not valid JSON.
        samples = samples / peak

    # Guard against a zero step when the sample rate is lower than fps.
    samples_per_frame = max(1, int(frame_rate / fps))
    return [
        float(np.abs(samples[start:start + samples_per_frame]).mean())
        for start in range(0, len(samples), samples_per_frame)
    ]


@app.route("/avatar", methods=["POST"])
def avator_response():
    """Generate a chat reply, synthesise it to speech and return lip-sync data.

    Expects a JSON object body with an optional ``"text"`` field (defaults to
    an empty string). The reply is produced by the OpenAI chat completion
    API, converted to speech with gTTS, exported as a WAV file into
    ``AUDIO_DIR`` and analysed at 30 frames per second.

    Returns:
        A JSON response with the keys ``"text"`` (the reply), ``"audio_url"``
        (absolute URL of the generated WAV, served by ``get_audio``) and
        ``"lip_sync"`` (list of per-frame amplitudes in 0..1). If the request
        body is not a JSON object, a 400 JSON response with an ``"error"``
        key is returned instead.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Covers a missing/invalid JSON body as well as JSON that is not an
        # object (e.g. a list), both of which would otherwise crash on .get().
        return Response(
            json.dumps({"error": "Request body must be a JSON object."}),
            status=400,
            mimetype="application/json",
        )
    user_text = data.get("text", "")

    # 1. Create the reply with the chat completion API.
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful avatar assistant."},
            {"role": "user", "content": user_text}
        ]
    )
    reply_text = completion.choices[0].message.content

    # 2. TTS speech generation. Both file names share one base path so the
    #    extension swap can never touch a ".mp3" inside the directory part.
    base_path = os.path.join(AUDIO_DIR, f"output_{uuid.uuid4().hex}")
    audio_filename = base_path + ".mp3"
    wav_filename = base_path + ".wav"

    try:
        tts = gTTS(text=reply_text, lang='ja')
        tts.save(audio_filename)

        # 3. Convert MP3 to WAV using pydub. export() hands back the open
        #    output file, so close it explicitly.
        sound = AudioSegment.from_mp3(audio_filename)
        sound.export(wav_filename, format="wav").close()
    finally:
        # The MP3 is only an intermediate artefact; only the WAV is served.
        if os.path.exists(audio_filename):
            os.remove(audio_filename)

    # 4. Audio data analysis (amplitude calculation for lip-syncing). The
    #    decoded segment is analysed directly instead of re-reading the WAV.
    lip_sync = _compute_lip_sync(
        sound.get_array_of_samples(),
        sound.channels,
        sound.frame_rate,
    )

    # 5. Return to client
    response_data = {
        "text": reply_text,
        "audio_url": url_for("get_audio", filename=os.path.basename(wav_filename), _external=True),
        "lip_sync": lip_sync
    }

    return Response(json.dumps(response_data, ensure_ascii=False), mimetype="application/json")

@app.route("/audio/<filename>", methods=["GET"])
def get_audio(filename):
    """Serve a generated WAV file from ``AUDIO_DIR``.

    Args:
        filename: Name of the file inside ``AUDIO_DIR``, taken from the URL.

    Returns:
        The file contents with the ``audio/wav`` MIME type. A request for a
        file that does not exist, or for a path that would resolve outside
        ``AUDIO_DIR``, results in a 404 response.
    """
    # Imported locally because the file's import block is not visible here.
    from flask import send_from_directory

    # send_from_directory joins the path safely and turns a missing file into
    # a 404 instead of an unhandled FileNotFoundError (HTTP 500).
    return send_from_directory(AUDIO_DIR, filename, mimetype="audio/wav")

if __name__ == "__main__":
    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default: it lets anyone who can reach the port execute
    # arbitrary code. Opt in explicitly with FLASK_DEBUG=1 during development.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host="0.0.0.0", port=5000, debug=debug_enabled)

$ curl -X POST http://127.0.0.1:5000/avatar \
-H "Content-Type: application/json" \
-d '{"text": "おはよう!"}'
{“text”: “おはようございます!お困りのことはありますか?”, “audio_url”: “http://127.0.0.1:5000/audio/output_8b9b617f9ea44b4e9382d99069279c2b.wav”, “lip_sync”: [0.0, 5.401201227152919e-08, 0.008150034567687852, 0.1152054616946809, 0.12006854124357258, 0.08367767791556842, 0.026896253726828846, 0.01888351769433522, 0.07841339713952383, 0.2201013265350214, 0.2508166616255455, 0.2270837834334356, 0.18286134036209653, 0.12693546644773795, 0.19306745020092467, 0.2823540595428423, 0.26987355787927236, 0.30742827204770345, 0.33021129499200624, 0.3036520222097394, 0.13783822322084432, 0.053725370522404184, 0.23884381886531564, 0.26545121635051633, 0.16945415460398394, 0.04699428552910167, 0.037515015339411484, 0.22347993993864235, 0.2646327183165536, 0.22138405781445794, 0.19320739532472023, 0.20940100678390874, 0.23490348053407076, 0.22536436503478374, 0.21555653977444586, 0.14586462429244265, 0.14904603983926024, 0.13877635786198853, 0.08746219159140994, 0.02229572656958908, 0.01466869031672644, 0.010831244868858834, 0.008575973296461132, 0.0059669770556971865, 0.002284438059024327, 0.0007382901957395324, 0.0006873028561552089, 0.00023819297411744372, 0.00011202091345115153, 0.00011088666119344939, 0.00011083264918117787, 0.00014950524996759277, 0.054377403534546086, 0.15464892192023505, 0.11003192109925247, 0.01149159573089055, 0.0017083999481484681, 0.01060147776865575, 0.21648749081795793, 0.2837956941623817, 0.2357313766581688, 0.295791492027827, 0.24570480274813122, 0.2426950913883248, 0.22178412478935314, 0.1279997191375362, 0.1125969515620274, 0.18296455731754743, 0.2677368966858229, 0.30668674113122757, 0.252059802099987, 0.22629239942963317, 0.24090750983018622, 0.0999186038975068, 0.005041211165363177, 0.01174237350386726, 0.16595396016073974, 0.22518860994685216, 0.04122672082271097, 0.002156267553903988, 0.054671390917340024, 0.2686635267683533, 0.24548022080110612, 0.2177663332325109, 0.16169052197208658, 0.25034897161128633, 0.2103575595212375, 0.17521005271572399, 
0.15601337337423843, 0.12766689711791904, 0.1107986756254591, 0.047134932809056736, 0.08557376960636046, 0.11917485848852785, 0.14922184893920407, 0.17545402497515447, 0.15926343818865316, 0.14388913494361147, 0.15382718316553604, 0.08909978179147043, 0.019018493713001773, 0.022057209523398003, 0.019663235103487015, 0.0030874346454651514, 0.0014317504212936955, 0.009182042086159962, 0.0501337337423843, 0.07244177505077129, 0.0849778010629564, 0.06556064468737847, 0.044696560515058555, 0.017215464719353583, 0.0009286285269844013, 0.0002996046320701724, 0.00023268374886574773, 0.00010975240893574729, 0.00011115672125480706, 0.00011115672125480704, 0.00011056258911982022, 0.00011158881735297929, 0.00011050857710754869, 0.00011126474527935009, 0.00011148079332843622, 0.00011083264918117787, 0.00011083264918117786, 7.388843278745191e-05]}