与数字人对话--LLM与数字人结合案例_unity 数字人 llama

图1 数字人对话实现流程


        Demo的环境:Win 11+Nvidia RTX 3050 + CUDA 12.1 + Unity

图2 数字人对话UI界面

1. 语音录入


  • 文字输入
  • 语音输入


  1. using UnityEngine;
  private AudioClip clip; // Recorded voice clip (assigned by StartRecording)
  private int audioRecordMaxLength = 60; // Maximum recording length: 60 seconds
  private byte[] bytes; // Recorded audio data (WAV-encoded in StopRecording)
  /// <summary>
  /// Starts a non-looping microphone capture of at most
  /// <c>audioRecordMaxLength</c> seconds at 44100 Hz and stores the
  /// resulting clip in <c>clip</c>.
  /// </summary>
  private void StartRecording()
  {
      const string deviceName = null; // passed through to Microphone.Start as-is
      const bool loop = false;
      const int sampleRate = 44100;

      clip = Microphone.Start(deviceName, loop, audioRecordMaxLength, sampleRate);
  }
  /// <summary>
  /// Stops the microphone, encodes the captured samples as WAV into
  /// <c>bytes</c>, sends them for transcription and saves a copy to
  /// <c>Application.dataPath + "/test.wav"</c>.
  /// Does nothing beyond stopping the device when no audio was captured.
  /// </summary>
  private void StopRecording()
  {
      // StartRecording was never called (or this recording was already handled).
      if (clip == null)
      {
          Microphone.End(null);
          return;
      }

      // Both values have to be read before Microphone.End stops the device.
      var wasRecording = Microphone.IsRecording(null);
      var position = Microphone.GetPosition(null);
      Microphone.End(null);

      // NOTE(review): the capture is started with loop == false, so it ends on
      // its own after audioRecordMaxLength seconds; the position is then
      // expected to read 0 even though the whole clip is filled — confirm
      // against the Unity Microphone documentation.
      if (position <= 0 && !wasRecording)
      {
          position = clip.samples;
      }

      // Nothing captured: do not send or save an empty WAV.
      if (position <= 0)
      {
          clip = null;
          return;
      }

      var samples = new float[position * clip.channels];
      clip.GetData(samples, 0);
      bytes = EncodeAsWAV(samples, clip.frequency, clip.channels);

      // Drop the reference so a repeated StopRecording call cannot resend
      // the same audio.
      clip = null;

      SendRecording();
      // Save the recorded audio file.
      File.WriteAllBytes(Application.dataPath + "/test.wav", bytes);
  }
  /// <summary>
  /// Encodes floating-point samples as a 16-bit PCM WAV file image
  /// (44-byte RIFF header followed by the sample data).
  /// </summary>
  /// <param name="samples">
  /// Samples in playback order, nominally in [-1, 1] (assumed to be
  /// interleaved by channel when <paramref name="channels"/> &gt; 1).
  /// Values outside that range are clamped; NaN is written as silence.
  /// </param>
  /// <param name="frequency">Sample rate in Hz written to the header.</param>
  /// <param name="channels">Channel count written to the header.</param>
  /// <returns>The complete WAV file as a byte array.</returns>
  private byte[] EncodeAsWAV(float[] samples, int frequency, int channels)
  {
      const int headerSize = 44;
      const int bytesPerSample = 2;
      int dataSize = samples.Length * bytesPerSample;

      using (var memoryStream = new MemoryStream(headerSize + dataSize))
      {
          using (var writer = new BinaryWriter(memoryStream))
          {
              // RIFF chunk descriptor.
              writer.Write("RIFF".ToCharArray());
              writer.Write(36 + dataSize);
              writer.Write("WAVE".ToCharArray());

              // "fmt " sub-chunk: 16 bytes, format tag 1 (PCM).
              writer.Write("fmt ".ToCharArray());
              writer.Write(16);
              writer.Write((ushort)1);
              writer.Write((ushort)channels);
              writer.Write(frequency);
              writer.Write(frequency * channels * bytesPerSample); // byte rate
              writer.Write((ushort)(channels * bytesPerSample));   // block align
              writer.Write((ushort)16);                            // bits per sample

              // "data" sub-chunk.
              writer.Write("data".ToCharArray());
              writer.Write(dataSize);
              foreach (var sample in samples)
              {
                  // Clamp before scaling: an out-of-range float cast to short
                  // does not saturate and would produce a corrupted sample.
                  float clamped;
                  if (float.IsNaN(sample))
                  {
                      clamped = 0f;
                  }
                  else if (sample > 1f)
                  {
                      clamped = 1f;
                  }
                  else if (sample < -1f)
                  {
                      clamped = -1f;
                  }
                  else
                  {
                      clamped = sample;
                  }

                  writer.Write((short)(clamped * short.MaxValue));
              }
          }

          // MemoryStream.ToArray remains usable after the writer closed the stream.
          return memoryStream.ToArray();
      }
  }

2. 音频转文字


       为了更快看到效果,直接使用了Hugging Face上的Whisper模型API接口。首先需要在Hugging Face注册账号,在Unity中下载Hugging Face接口包,将Hugging Face中的Access Tokens设置到Unity里,这样就可以在Unity中调用Hugging Face接口了(具体操作可以参考:如何安装和使用 Hugging Face Unity API)。


  /// <summary>
  /// Passes the recorded <c>bytes</c> to
  /// <c>HuggingFaceAPI.AutomaticSpeechRecognition</c>. The success callback
  /// stores <c>GenSubmitText(response)</c> in <c>_submittedText</c>; the
  /// error callback stores the error in <c>_errorMsg</c>.
  /// </summary>
  private void SendRecording()
  {
      HuggingFaceAPI.AutomaticSpeechRecognition(
          bytes,
          transcript => _submittedText = GenSubmitText(transcript),
          failure => _errorMsg = failure);
  }

3. AI对话

        AI对话采用了Llama 2开源模型。Meta公司的Llama2是一个非常强大的开源大模型,包含了70亿、130亿和700亿参数的模型,它可以帮助我们进行高效的数据分析和处理。Llama 2可以采用多种部署方式,本项目中采用本地部署方式(Llama 2部署参考:在本地电脑部署Meta公司开源大语言模型-Llama2教程)。

        在Unity中开发采用的是C#,很幸运在GitHub上找到了Llama 2推理的C#项目,直接使用方便了很多(Llama 2 C#推理项目地址:LLamaSharp)。


  1. using LLama;
  2. using LLama.Common;
  3. using static LLama.StatefulExecutorBase;
  4. using Cysharp.Threading.Tasks;
  5. using System.Threading;
  // Model file location; Start appends it to Application.streamingAssetsPath.
  public string ModelPath = "models/llama-2-7b-chat.Q4_0.gguf";
  // Prompt that Start adds to the chat session as its system message.
  [TextArea(3, 10)]
  public string SystemPrompt = "Transcript of a dialog, where I interacts with an Assistant named Amy. Amy is helpful, kind, honest, good at writing, and never fails to answer my requests immediately and with precision.\r\n\r\nI: Hello, Amy.\r\nAmy: Hello. How may I help you today?\r\nI: Please tell me the best city in Europe.\r\nAmy: Sure. The best city in Europe is Kyiv, the capital of Ukraine.\r\nI:";
  // Executor state captured in Start right after the executor is created.
  private ExecutorBaseState _emptyState;
  // Chat session that ChatRoutine sends user messages to.
  private ChatSession _chatSession;
  // Pending user input; ChatRoutine waits for it to become non-empty, then clears it.
  private string _submittedText = "";
  // Last error reported by the speech-recognition error callback.
  private string _errorMsg = "";
  // Source of the cancellation token handed to ChatRoutine.
  private CancellationTokenSource _cts;
  /// <summary>
  /// Loads the model weights on a thread-pool thread, creates the context,
  /// executor and chat session back on the main thread, then runs
  /// <c>ChatRoutine</c>. The weights and the context are disposed when
  /// <c>ChatRoutine</c> completes.
  /// </summary>
  async UniTaskVoid Start()
  {
      _cts = new CancellationTokenSource();

      // Describe the model to load.
      var modelFile = $"{Application.streamingAssetsPath}/{ModelPath}";
      var modelParams = new ModelParams(modelFile)
      {
          ContextSize = 4096,
          Seed = 1337,
          GpuLayerCount = 35
      };

      // Loading is long-running, so it happens off the main thread.
      await UniTask.SwitchToThreadPool();
      using (var weights = LLamaWeights.LoadFromFile(modelParams))
      {
          await UniTask.SwitchToMainThread();

          // Set up the chat session.
          using (var llamaContext = weights.CreateContext(modelParams))
          {
              var executor = new InteractiveExecutor(llamaContext);

              // Keep the pristine state for switching back to an empty session.
              _emptyState = executor.GetStateData();

              _chatSession = new ChatSession(executor);
              _chatSession.AddSystemMessage(SystemPrompt);

              // Chat with the LLM until the token is cancelled.
              await ChatRoutine(_cts.Token);
          }
      }
  }
  /// <summary>
  /// Chat loop: waits for <c>_submittedText</c> to become non-empty, sends it
  /// to the chat session as a user message and accumulates the streamed
  /// reply, one token per frame, until <paramref name="cancel"/> is signalled.
  /// </summary>
  /// <param name="cancel">Token that ends the loop, including while it is waiting for input.</param>
  public async UniTask ChatRoutine(CancellationToken cancel = default)
  {
      var userMessage = "";
      var outputMessage = "";
      while (!cancel.IsCancellationRequested)
      {
          // Allow input and wait for the user to submit a message or switch the session
          SetInteractable(true);

          // Observe the token while waiting so cancellation can end the loop
          // even if no message ever arrives; SuppressCancellationThrow turns
          // the cancellation into a flag instead of an exception.
          var canceled = await UniTask
              .WaitUntil(() => _submittedText != "", cancellationToken: cancel)
              .SuppressCancellationThrow();
          if (canceled)
          {
              break;
          }

          userMessage = _submittedText;
          _submittedText = "";
          outputMessage = "";

          // Disable input while processing the message
          SetInteractable(false);
          await foreach (var token in ChatConcurrent(
              _chatSession.ChatAsync(
                  new ChatHistory.Message(AuthorRole.User, userMessage),
                  new InferenceParams()
                  {
                      Temperature = 0.6f,
                      // Stop at the user-turn marker used by SystemPrompt ("I:");
                      // a single space does not mark the end of the reply.
                      AntiPrompts = new List<string> { "I:" }
                  }
              )
          ))
          {
              // NOTE(review): outputMessage is only accumulated here; whatever
              // displays or speaks the reply is not visible in this snippet.
              outputMessage += token;
              await UniTask.NextFrame();
          }
      }
  }

4. 文字转音频

       文字转音频采用的是Bark大模型。Bark模型是由Suno AI创建的一个基于转换器(Transformer)的文本到音频模型。它是一个端到端的模型,能够生成高度逼真的多语言语音以及其他音频,包括音乐、背景噪音和简单的音效。Bark还能产生非语言交流,例如大笑、叹息和哭泣等。



  1. import uvicorn
  2. from fastapi import FastAPI
  3. from fastapi.responses import FileResponse
  4. from fastapi.middleware.cors import CORSMiddleware
  5. from starlette.background import BackgroundTask
  6. from ctransformers import AutoModelForCausalLM
  7. from bark import SAMPLE_RATE, generate_audio
  8. from bark.generation import (
  9. generate_text_semantic,
  10. preload_models,
  11. )
  12. from bark.api import semantic_to_waveform
  13. import numpy as np
  14. import base64
  15. import os
  16. import nltk # we'll use this to split into sentences
  17. from scipy.io.wavfile import write as write_wav
  18. import time
  19. import random
  20. import string
  # Set CUDA_VISIBLE_DEVICES to "0" for this process.
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  # FastAPI application object on which the /GenAudio route is registered.
  app = FastAPI()
  def productFileName():
      """Return a fresh WAV file name and print it.

      The name is the current Unix timestamp in whole seconds, followed by
      six random ASCII letters or digits and the ".wav" extension.
      """
      alphabet = string.ascii_letters + string.digits
      suffix = ''.join(random.choice(alphabet) for _ in range(6))
      file_name = f"{int(time.time())}{suffix}.wav"
      print(file_name)
      return file_name
  @app.get("/GenAudio", summary="download audio file")
  async def GenAudio(text: str, speaker: str):
      """Synthesize `text` with Bark and return the result as a WAV download.

      Args:
          text: Text to speak; passed to `generate_audio`.
          speaker: Passed to `generate_audio` as `history_prompt`.

      Returns:
          A FileResponse serving the generated WAV; the file is deleted by a
          background task after the response has been sent.
      """
      audio_array = generate_audio(text, history_prompt=speaker)

      # Write, serve and delete the file through one absolute path next to
      # this script, so the endpoint does not depend on the working directory
      # the server was started from.
      file_name = productFileName()
      directory_path = os.path.dirname(os.path.abspath(__file__))
      file_path = os.path.join(directory_path, file_name)
      write_wav(file_path, SAMPLE_RATE, audio_array)

      return FileResponse(
          file_path,
          filename=file_name,
          media_type="audio/wav",
          background=BackgroundTask(os.remove, file_path),
      )
  # Serve the app with uvicorn on port 8080 when this file is run as a script.
  # NOTE(review): host is an empty string here — confirm which interface the
  # server is meant to bind to.
  if __name__ == "__main__":
      uvicorn.run(app, host="", port=8080)


  // Build the request URL; both query values are escaped before being appended.
  string queryStringText = Uri.EscapeDataString(text);
  string queryStringSpeaker = Uri.EscapeDataString(speaker);
  string queryString = "?text=" + queryStringText + "&speaker=" + queryStringSpeaker;
  string urlWithParams = text2AudioUrl + queryString;
  string filename = string.Empty;
  using (UnityWebRequest request = UnityWebRequest.Get(urlWithParams))
  {
      request.downloadHandler = new DownloadHandlerBuffer();
      // NOTE(review): UniTask's awaiter may itself throw on a failed request,
      // in which case the error branch below is never reached — confirm.
      await request.SendWebRequest().ToUniTask();
      if (request.result != UnityWebRequest.Result.Success)
      {
          // Covers connection, protocol and data-processing errors.
          Debug.Log(request.error);
      }
      else
      {
          // Take the file name from the "filename=" part of Content-Disposition
          // without assuming the header exists or where that part sits.
          string disposition = request.GetResponseHeader("Content-Disposition");
          if (!string.IsNullOrEmpty(disposition))
          {
              const string marker = "filename=";
              foreach (string part in disposition.Split(';'))
              {
                  string trimmed = part.Trim();
                  if (trimmed.StartsWith(marker, StringComparison.OrdinalIgnoreCase))
                  {
                      // Path.GetFileName drops any directory components the
                      // server-supplied name might carry.
                      filename = Path.GetFileName(trimmed.Substring(marker.Length).Trim().Trim('"'));
                      break;
                  }
              }
          }

          if (string.IsNullOrEmpty(filename))
          {
              Debug.Log("Content-Disposition header has no filename; audio file not saved.");
          }
          else
          {
              // Join with Path.Combine so the name is never glued onto the
              // directory name when audioFilesDict has no trailing separator.
              fullPath = Path.Combine(Application.dataPath, "..", audioFilesDict, filename);
              string directory = Path.GetDirectoryName(fullPath);
              if (!Directory.Exists(directory))
              {
                  Directory.CreateDirectory(directory);
              }
              File.WriteAllBytes(fullPath, request.downloadHandler.data);
          }
      }
  }

5. 结束



