语音对话COOKBOOK

更新时间：2025-01-17

目标

实现一个语音对话功能，支持多种语音音色。用户可以参考cookbook代码，通过AppBuilder-SDK将语音功能很好地融入自己的平台、应用中。

实现原理

通过循环不断处理用户的语音，将语音转文本，然后进行对话，最后将对话结果通过TTS进行播报。

使用大模型的 ASR 进行语音转文本。
使用用户自己创建的Agent进行对话，适配用户的应用场景，并具有上下文理解能力。
使用大模型的 TTS 进行文本转语音并进行播报。

前置条件

使用内置ASR、TTS组件之前，请先开通组件服务（短语音识别-极速版、短文本在线合成）并购买额度，可参考开通组件服务。
pip安装pyaudio、webrtcvad依赖包
给程序开放麦克风权限
创建好自己的Agent应用

示例代码

 # Copyright (c) 2024 Baidu, Inc. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import os
 import time
 import wave
 import sys
 import pyaudio
 import webrtcvad
 import appbuilder
 import re

 # Create an API key on the Qianfan AppBuilder console first; the procedure is described at:
 # https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5

 # Set the APPBUILDER_TOKEN environment variable (replace the placeholder with your key).
 os.environ["APPBUILDER_TOKEN"] = (
    "..."
 )
 # ID of a published AppBuilder application (replace the placeholder).
 app_id = "..."
 appbuilder.logger.setLoglevel("ERROR")

 # Audio capture parameters shared by the recorder.
 FORMAT = pyaudio.paInt16
 CHANNELS = 1 if sys.platform == "darwin" else 2
 RATE = 16000  # samples per second
 DURATION = 30  # ms of audio per read
 # Frames per read: 16 frames/ms * 30 ms = 480 frames.
 CHUNK = RATE // 1000 * DURATION


 class Chatbot:
    """Voice chat loop: record speech, transcribe it, ask the agent, speak the answer.

    Each instance owns one PyAudio handle, one ASR component, one TTS
    component and one AppBuilder agent client bound to a single conversation.
    """

    def __init__(self):
        """Create the audio handle, the ASR/TTS components and a new agent conversation."""
        self.p = pyaudio.PyAudio()
        self.tts = appbuilder.TTS()
        self.asr = appbuilder.ASR()
        self.agent = appbuilder.AppBuilderClient(app_id)
        self.conversation_id = self.agent.create_conversation()

    def run(self):
        """Play a greeting, then loop forever: record -> ASR -> agent -> TTS playback.

        Recordings with less than 1000 ms of detected speech and empty
        transcriptions are skipped. The loop never returns on its own.
        """
        self.run_tts_and_play_audio(
            "我是你的专属聊天机器人，如果你有什么问题，可以直接问我"
        )
        while True:
            # Record
            audio_path = "output.wav"
            print("开始记录音频...")
            if self.record_audio(audio_path) < 1000:
                # Too little speech captured; pause briefly and listen again.
                time.sleep(1)
                continue
            print("音频记录结束")

            # ASR
            print("开始执行ASR...")
            query = self.run_asr(audio_path)
            print("结束执行ASR")

            # Agent
            print("query: ", query)
            if not query:
                continue
            answer = self.run_agent(query)
            # Print every link separately and strip it from the text that is
            # passed on to TTS.
            results = re.findall(r"(https?://[^\s]+)", answer)
            for result in results:
                print("链接地址:", result)
                answer = answer.replace(result, "")
            print("answer:", answer)

            # TTS
            print("开始执行TTS并播报...")
            self.run_tts_and_play_audio(answer)
            print("结束TTS并播报结束")

    def record_audio(self, path):
        """Record microphone input to a WAV file, keeping only frames the VAD marks as speech.

        Audio is read in CHUNK-frame pieces and evaluated in windows. The
        first window is 33 * 5 reads (about 5 seconds); every following window
        is half as long, but never shorter than 33 reads. Recording stops when
        a finished window is more than 70% non-speech, or once more than
        33 * 10 speech reads have been accumulated.

        Args:
            path: Destination path of the WAV file (overwritten).

        Returns:
            Accumulated speech time in milliseconds (speech reads * DURATION).
        """
        # One read covers DURATION (30 ms) of audio, so 33 reads are ~1 second.
        reads_per_second = 33
        with wave.open(path, "wb") as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(self.p.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            stream = self.p.open(
                format=FORMAT, channels=CHANNELS, rate=RATE, input=True
            )
            try:
                # NOTE(review): webrtcvad documents 16-bit *mono* PCM frames of
                # 10/20/30 ms; CHANNELS is 2 on non-macOS platforms - confirm
                # that the VAD accepts the frames produced there.
                vad = webrtcvad.Vad(1)
                not_speech_times = 0
                speech_times = 0
                total_times = 0
                start_up_times = reads_per_second * 5  # first window: 5 seconds
                history_speech_times = 0
                while history_speech_times <= reads_per_second * 10:
                    # Second argument disables the exception on input overflow.
                    data = stream.read(CHUNK, False)
                    if vad.is_speech(data, RATE):
                        speech_times += 1
                        wf.writeframes(data)
                    else:
                        not_speech_times += 1
                    total_times += 1
                    if total_times >= start_up_times:
                        history_speech_times += speech_times
                        # A mostly silent window means the speaker has finished.
                        if float(not_speech_times) / float(total_times) > 0.7:
                            break
                        # Emulate a sliding window: restart the counters and
                        # halve the window, with a floor of one second.
                        not_speech_times = 0
                        speech_times = 0
                        total_times = 0
                        start_up_times = max(start_up_times / 2, reads_per_second)
            finally:
                # Release the input device even if reading or the VAD fails.
                stream.close()
            return history_speech_times * DURATION

    def run_tts_and_play_audio(self, text: str):
        """Synthesize ``text`` with the streaming TTS component and play it back.

        Documentation of the built-in AppBuilder TTS component (parameters can
        be tuned according to it):
        https://github.com/baidubce/app-builder/tree/master/python/core/components/tts

        Args:
            text: Text to be spoken.
        """
        msg = self.tts.run(
            appbuilder.Message(content={"text": text}),
            speed=5,
            pitch=5,
            volume=5,
            person=0,
            audio_type="pcm",
            model="paddlespeech-tts",
            stream=True,
        )
        stream = self.p.open(
            format=self.p.get_format_from_width(2),
            channels=1,
            rate=24000,
            output=True,
            frames_per_buffer=2048,
        )
        try:
            # Play each PCM chunk as soon as the TTS stream yields it.
            for pcm in msg.content:
                stream.write(pcm)
            stream.stop_stream()
        finally:
            # Release the output device even if synthesis or playback fails.
            stream.close()

    # Documentation of the built-in AppBuilder ASR component (parameters can be
    # tuned according to it):
    # https://github.com/baidubce/app-builder/blob/master/python/core/components/asr/README.md
    def run_asr(self, audio_path: str):
        """Transcribe a WAV file with the ASR component.

        Args:
            audio_path: Path of the WAV file to transcribe.

        Returns:
            The first recognition result, or an empty string when the
            component returns no result.
        """
        # Read the audio up front so the file is closed before the remote call.
        with open(audio_path, "rb") as f:
            raw_audio = f.read()
        content_data = {"audio_format": "wav", "raw_audio": raw_audio, "rate": 16000}
        out = self.asr.run(appbuilder.Message(content_data))
        results = out.content["result"]
        return results[0] if results else ""

    def run_agent(self, query):
        """Send ``query`` to the agent in the current conversation and return the full answer.

        The agent is called in streaming mode; the ``answer`` fragments of all
        streamed items are concatenated in order.
        """
        msg = self.agent.run(self.conversation_id, query, stream=True)
        return "".join(content.answer for content in msg.content)


 if __name__ == "__main__":
    # Script entry point: build the chatbot and start its conversation loop.
    Chatbot().run()

使用方法

直接运行程序即可。

也可以将下面的功能模块替换成自己的其他实现或模型：

record_audio: 录音
run_asr: 语音识别，AppBuilder ASR组件使用文档
run_agent: Agent对话功能。
run_tts_and_play_audio：回复的语音生成并播报。AppBuilder TTS组件使用文档

流式TTS已经上线，测试配额申请地址：配额。

知识库管理

数字人应用调用COOKBOOK