1. tts
使用sherpa的参考代码如下:
import soundfile as sf
import sherpa_onnx
def write(text, output_filename, sid=10, provider='cpu'):
    """Synthesize `text` with the sherpa-onnx VITS model and save it as 16-bit PCM.

    Args:
        text: the sentence(s) to synthesize.
        output_filename: path of the audio file to create.
        sid: speaker id passed to the multi-speaker VITS model.
        provider: onnxruntime execution provider (e.g. 'cpu').
    """
    # Model files are looked up relative to the working directory.
    vits_cfg = sherpa_onnx.OfflineTtsVitsModelConfig(
        model='vits-aishell3.onnx',
        lexicon='lexicon.txt',
        tokens='tokens.txt',
    )
    model_cfg = sherpa_onnx.OfflineTtsModelConfig(vits=vits_cfg, provider=provider)
    tts_cfg = sherpa_onnx.OfflineTtsConfig(model=model_cfg, max_num_sentences=2)
    audio = sherpa_onnx.OfflineTts(tts_cfg).generate(text, sid=sid)
    sf.write(
        output_filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )
使用edge-tts(微软家的,需要联网)的示例代码如下:
# Example: Microsoft's cloud TTS via edge-tts (requires network access).
# The synthesis part is left commented out; only playback is active.
#import edge_tts,os,asyncio,sys
import sys
from pydub import AudioSegment,playback
# Commented-out reference: synthesize `text` to "<text>.mp3" with the
# zh-CN-YunxiNeural voice at +5% speaking rate.
#async def read(text):
# tts = edge_tts.Communicate(text=text, voice='zh-CN-YunxiNeural',rate = '+5%')
# if 'temp.mp3' in os.listdir('.'):
# os.system("rm temp.mp3")
# await tts.save(text+".mp3")#asyncio.run(read(sys.argv[1]))
# Play "<argv[1]>.mp3" through pydub (uses ffmpeg under the hood).
playback.play(AudioSegment.from_mp3(sys.argv[1]+'.mp3'))
2. 使用python制作智能语音助手
# --- Module setup: imports, local LLM client, speech-recognition model. ---
# FIX: the original fused `model = whisper.load_model("medium")` and
# `import soundfile as sf` onto one line (a syntax error) and imported
# soundfile / sherpa_onnx twice; deduplicated here.
import soundfile as sf
import whisper, pyaudio, wave, os, warnings, time, torch, sherpa_onnx
from pydub import AudioSegment, playback
from transformers import AutoModelForCausalLM, AutoTokenizer
from autogen import OpenAIWrapper

# OpenAI-compatible client pointed at a local model server; the key is a
# placeholder because the local endpoint does not authenticate.
client = OpenAIWrapper(api_key="NULL", base_url="http://localhost:2600/v1", api_type="open_ai")
warnings.filterwarnings('ignore')
# Whisper "medium" checkpoint used by wakeup() for transcription.
model = whisper.load_model("medium")
def write(text, output_filename, sid=10, provider='cpu'):
    """Synthesize `text` with the sherpa-onnx VITS model and save it as 16-bit PCM.

    Args:
        text: the sentence(s) to synthesize.
        output_filename: path of the audio file to create.
        sid: speaker id for the multi-speaker VITS model.
        provider: onnxruntime execution provider (e.g. 'cpu').

    Raises:
        ValueError: if the TTS configuration fails validation.
    """
    # FIX: the original defined `write` twice; the first, simpler definition
    # was dead code (immediately overwritten). Only the validated version is
    # kept. The unused timing locals (elapsed_seconds/real_time_factor) were
    # computed and never reported, so they are dropped.
    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model='vits-aishell3.onnx',
                lexicon='lexicon.txt',
                tokens='tokens.txt',
            ),
            provider=provider,
        ),
        # rule_fsts=args.tts_rule_fsts,
        max_num_sentences=2,
    )
    if not tts_config.validate():
        raise ValueError("Please check your config")
    tts = sherpa_onnx.OfflineTts(tts_config)
    audio = tts.generate(text, sid=sid)
    if len(audio.samples) == 0:
        print("Error in generating audios. Please read previous error messages.")
        return
    sf.write(
        output_filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )


def asr(filename):
    """Transcribe `filename` with whisper.cpp, trimming silence via ffmpeg first.

    The intermediate 'trans_<filename>' file and the input file are deleted
    afterwards. Returns the transcript with newlines stripped.
    """
    # NOTE(review): filenames are interpolated into shell commands — shell
    # injection risk if `filename` is not trusted; prefer
    # subprocess.run([...], shell=False).
    trimmed = 'trans_' + filename
    os.system('ffmpeg -i %s -af silenceremove=stop_periods=-1:stop_duration=1:stop_threshold=-30dB -ac 1 -ar 16000 %s' % (filename, trimmed))
    res = "".join(os.popen('whisper.cpp/main -m /Users/czhang39/.cache/huggingface/hub/models--ggerganov--whisper.cpp/snapshots/d15393806e24a74f60827e23e986f0c10750b358/ggml-large-v2.bin -np -nt -l zh --prompt 你好小特,以下是普通话。 -f %s' % trimmed).readlines()).replace('\n', '')
    os.system('rm %s' % filename)
    os.system('rm %s' % trimmed)
    return res


def wakeup(seconds=2):
    """Record `seconds` of microphone audio and return Whisper's transcript."""
    chunk = 1024                     # record in chunks of 1024 samples
    sample_format = pyaudio.paInt16  # 16 bits per sample
    channels = 1
    fs = 44100                       # sample rate in Hz
    filename = "output.wav"
    p = pyaudio.PyAudio()            # interface to PortAudio
    stream = p.open(format=sample_format, channels=channels, rate=fs,
                    frames_per_buffer=chunk, input=True)
    frames = []
    for _ in range(int(fs / chunk * seconds)):
        frames.append(stream.read(chunk))
    # Stop/close the stream and release PortAudio before writing the file.
    stream.stop_stream()
    stream.close()
    p.terminate()
    wf = wave.open(filename, 'wb')
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(sample_format))
    wf.setframerate(fs)
    wf.writeframes(b''.join(frames))
    wf.close()
    result = model.transcribe(filename, language='zh', initial_prompt='你好, 以下是普通话')
    # result = asr(filename)  # alternative backend: whisper.cpp
    return result['text']


def read(text):
    """Speak `text`: synthesize it to reply.wav, then play the file."""
    write(text, 'reply.wav', 10)
    # FIX: reply.wav is a WAV written by sf.write, not an MP3 — from_mp3
    # forces ffmpeg's mp3 demuxer and fails on WAV input; use from_wav.
    playback.play(AudioSegment.from_wav('reply.wav'))


# Silence the HuggingFace tokenizers fork warning.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
activated = 0     # >0 while the assistant is awake; counts down on idle turns
waiting_time = 5  # number of turns to stay awake after (re)activation

# Main loop: continuously record short clips, wake on the keyword "小特",
# then forward recognized speech to the local LLM and speak the reply.
while True:  # FIX: `while True` instead of the C-style `while 1`
    text = wakeup()
    if activated > 1:
        # Only forward plausible utterances (1 < length < 20 chars).
        if 20 > len(text) > 1:
            print(text)
            # reply = llm_model.chat(tokenizer, text)[0]
            response = client.create(
                messages=[{"role": "user", "content": "<用户>%s<AI>" % text}],
                model="guff",
            )
            reply = client.extract_text_or_completion_object(response)[0]
            print(reply)
            read(reply)
            # FIX: reset from the named constant instead of a hard-coded 5.
            activated = waiting_time
        else:
            activated -= 1
            print(activated)
    elif '小特' in text:
        print('activated')
        activated = waiting_time
        read('你好啊!我被唤醒了')
    elif activated == 1:
        # Countdown expired: say goodbye and go back to standby.
        read('你好啊!再见')
        print('sleep')
        activated = 0
3. 使用C++代码
使用whisper.cpp的command代码,修改部分如下:
- 第559行,修改唤醒词:
// Wake-word prompt shown to whisper.cpp; replace the literal with your own phrase.
std::string k_prompt = "自定义唤醒词";
- 第607行,增加唤醒后的处理代码:
// After wake-up, have the Python TTS helper answer "我在" ("I'm here").
std::system("python read.py 我在");
- 第664行开始,自定义待机/关机/活跃状态的代码:
// Dispatch a recognized voice command:
//   "待机" -> acknowledge and return to standby (re-ask for the wake word)
//   "退出" -> say goodbye and stop the main loop
//   else   -> forward the transcript to the local LLM via chat.py
if (command == "待机") {
    fprintf(stdout, "好的!");
    std::system("python read.py 好的");
    ask_prompt = true;
} else if (command == "退出") {
    fprintf(stdout, "下次再见!");
    std::system("python read.py 下次再见");
    is_running = false;
} else {
    // FIX: the original used a variable-length array (non-standard C++)
    // plus sprintf with fragile buffer-size math; std::string concatenation
    // builds the same "python chat.py \"<command>\"" command safely.
    // NOTE(review): `command` is interpolated into a shell command — shell
    // injection risk if the transcript contains quotes or metacharacters.
    std::string cmd = "python chat.py \"" + command + "\"";
    std::system(cmd.c_str());
}
接下来是调用tts和本地大模型的python代码: