def _json_response(payload, status=200):
    """Serialize *payload* as UTF-8 JSON (non-ASCII kept verbatim) into a Response."""
    return Response(
        json.dumps(payload, ensure_ascii=False),
        status=status,
        mimetype="application/json",
    )


def _compute_lip_sync(audio, fps=30):
    """Return one normalized mean amplitude per video frame for *audio*.

    Args:
        audio: A pydub ``AudioSegment``.
        fps: Number of lip-sync frames per second of audio.

    Returns:
        A list of floats in ``[0.0, 1.0]``; empty when the audio has no samples.
    """
    # Work in float64 so np.abs cannot overflow on the most negative integer
    # sample (e.g. -32768 for 16-bit audio stays negative in int16).
    samples = np.asarray(audio.get_array_of_samples(), dtype=np.float64)
    if audio.channels == 2:
        # Interleaved stereo -> mono by averaging each left/right pair.
        samples = samples.reshape((-1, 2)).mean(axis=1)
    if samples.size == 0:
        return []

    magnitudes = np.abs(samples)
    peak = magnitudes.max()
    if peak > 0:
        # Skip normalization for pure silence to avoid 0/0 -> NaN, which is
        # not valid JSON.
        magnitudes = magnitudes / peak

    # Guard against a zero step when the frame rate is lower than fps.
    samples_per_frame = max(1, int(audio.frame_rate / fps))
    return [
        float(magnitudes[start:start + samples_per_frame].mean())
        for start in range(0, len(magnitudes), samples_per_frame)
    ]


@app.route("/avatar", methods=["POST"])
def avator_response():
    """Generate a chat reply, synthesize it to speech and return lip-sync data.

    Expects a JSON object body with an optional ``"text"`` string. Responds
    with JSON containing ``text`` (the reply), ``audio_url`` (absolute URL of
    the generated WAV served by ``get_audio``) and ``lip_sync`` (per-frame
    amplitudes at 30 fps). Returns 400 for a malformed body and 502 when the
    chat model returns no text.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return _json_response({"error": "Request body must be a JSON object."}, status=400)
    user_text = data.get("text", "")
    if not isinstance(user_text, str):
        return _json_response({"error": "'text' must be a string."}, status=400)

    # 1. Create a reply with the chat model (simple example).
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful avatar assistant."},
            {"role": "user", "content": user_text},
        ],
    )
    reply_text = completion.choices[0].message.content
    if not reply_text:
        # The message content may be empty/None; there is nothing to speak.
        return _json_response({"error": "The chat model returned no text."}, status=502)

    # 2. TTS speech generation.
    os.makedirs(AUDIO_DIR, exist_ok=True)
    tts = gTTS(text=reply_text, lang='ja')
    audio_filename = os.path.join(AUDIO_DIR, f"output_{uuid.uuid4().hex}.mp3")
    tts.save(audio_filename)

    # 3. Convert MP3 to WAV using pydub. splitext swaps only the extension,
    #    even if ".mp3" also appears elsewhere in the directory path.
    wav_filename = os.path.splitext(audio_filename)[0] + ".wav"
    sound = AudioSegment.from_mp3(audio_filename)
    sound.export(wav_filename, format="wav")

    # 4. Amplitude analysis for lip-syncing, done on the already-decoded audio
    #    instead of re-reading the WAV that was just written.
    lip_sync = _compute_lip_sync(sound, fps=30)

    # 5. Return to client.
    response_data = {
        "text": reply_text,
        "audio_url": url_for(
            "get_audio",
            filename=os.path.basename(wav_filename),
            _external=True,
        ),
        "lip_sync": lip_sync,
    }
    return _json_response(response_data)


@app.route("/audio/<filename>", methods=["GET"])
def get_audio(filename):
    """Serve a previously generated audio file from ``AUDIO_DIR`` as WAV.

    Responds with 404 when *filename* is not a plain file name or does not
    refer to an existing regular file inside ``AUDIO_DIR``.
    """
    # Reject anything carrying a directory component (e.g. backslash paths on
    # Windows) so the request cannot escape AUDIO_DIR.
    if os.path.basename(filename) != filename:
        return Response(status=404)
    file_path = os.path.join(AUDIO_DIR, filename)
    # isfile() also rejects "." / ".." and missing files, which would
    # otherwise surface as a 500 from send_file.
    if not os.path.isfile(file_path):
        return Response(status=404)
    return send_file(file_path, mimetype="audio/wav")


if __name__ == "__main__":
    # NOTE(review): debug=True enables the interactive debugger, and
    # host="0.0.0.0" listens on all interfaces. Fine for local
    # experiments; do not expose this configuration to an untrusted network.
    app.run(host="0.0.0.0", port=5000, debug=True)
$ curl -X POST http://127.0.0.1:5000/avatar \
-H "Content-Type: application/json" \
-d '{"text": "おはよう!"}'
{“text”: “おはようございます!お困りのことはありますか?”, “audio_url”: “http://127.0.0.1:5000/audio/output_8b9b617f9ea44b4e9382d99069279c2b.wav”, “lip_sync”: [0.0, 5.401201227152919e-08, 0.008150034567687852, 0.1152054616946809, 0.12006854124357258, 0.08367767791556842, 0.026896253726828846, 0.01888351769433522, 0.07841339713952383, 0.2201013265350214, 0.2508166616255455, 0.2270837834334356, 0.18286134036209653, 0.12693546644773795, 0.19306745020092467, 0.2823540595428423, 0.26987355787927236, 0.30742827204770345, 0.33021129499200624, 0.3036520222097394, 0.13783822322084432, 0.053725370522404184, 0.23884381886531564, 0.26545121635051633, 0.16945415460398394, 0.04699428552910167, 0.037515015339411484, 0.22347993993864235, 0.2646327183165536, 0.22138405781445794, 0.19320739532472023, 0.20940100678390874, 0.23490348053407076, 0.22536436503478374, 0.21555653977444586, 0.14586462429244265, 0.14904603983926024, 0.13877635786198853, 0.08746219159140994, 0.02229572656958908, 0.01466869031672644, 0.010831244868858834, 0.008575973296461132, 0.0059669770556971865, 0.002284438059024327, 0.0007382901957395324, 0.0006873028561552089, 0.00023819297411744372, 0.00011202091345115153, 0.00011088666119344939, 0.00011083264918117787, 0.00014950524996759277, 0.054377403534546086, 0.15464892192023505, 0.11003192109925247, 0.01149159573089055, 0.0017083999481484681, 0.01060147776865575, 0.21648749081795793, 0.2837956941623817, 0.2357313766581688, 0.295791492027827, 0.24570480274813122, 0.2426950913883248, 0.22178412478935314, 0.1279997191375362, 0.1125969515620274, 0.18296455731754743, 0.2677368966858229, 0.30668674113122757, 0.252059802099987, 0.22629239942963317, 0.24090750983018622, 0.0999186038975068, 0.005041211165363177, 0.01174237350386726, 0.16595396016073974, 0.22518860994685216, 0.04122672082271097, 0.002156267553903988, 0.054671390917340024, 0.2686635267683533, 0.24548022080110612, 0.2177663332325109, 0.16169052197208658, 0.25034897161128633, 0.2103575595212375, 0.17521005271572399, 
0.15601337337423843, 0.12766689711791904, 0.1107986756254591, 0.047134932809056736, 0.08557376960636046, 0.11917485848852785, 0.14922184893920407, 0.17545402497515447, 0.15926343818865316, 0.14388913494361147, 0.15382718316553604, 0.08909978179147043, 0.019018493713001773, 0.022057209523398003, 0.019663235103487015, 0.0030874346454651514, 0.0014317504212936955, 0.009182042086159962, 0.0501337337423843, 0.07244177505077129, 0.0849778010629564, 0.06556064468737847, 0.044696560515058555, 0.017215464719353583, 0.0009286285269844013, 0.0002996046320701724, 0.00023268374886574773, 0.00010975240893574729, 0.00011115672125480706, 0.00011115672125480704, 0.00011056258911982022, 0.00011158881735297929, 0.00011050857710754869, 0.00011126474527935009, 0.00011148079332843622, 0.00011083264918117787, 0.00011083264918117786, 7.388843278745191e-05]}