diff --git a/app/client/api.ts b/app/client/api.ts index 7a242ea99dd..a3da99ba70d 100644 --- a/app/client/api.ts +++ b/app/client/api.ts @@ -39,6 +39,7 @@ export interface MultimodalContent { export interface RequestMessage { role: MessageRole; content: string | MultimodalContent[]; + audio_url?: string; } export interface LLMConfig { diff --git a/app/components/chat.module.scss b/app/components/chat.module.scss index 73542fc67f1..32336e95d1c 100644 --- a/app/components/chat.module.scss +++ b/app/components/chat.module.scss @@ -443,6 +443,10 @@ transition: all ease 0.3s; } +.audio-message { + min-width: 350px; +} + .chat-message-item-image { width: 100%; margin-top: 10px; @@ -471,6 +475,10 @@ border: rgba($color: #888, $alpha: 0.2) 1px solid; } +.chat-message-item-audio { + margin-top: 10px; + width: 100%; +} @media only screen and (max-width: 600px) { $calc-image-width: calc(100vw/3*2/var(--image-count)); @@ -519,7 +527,7 @@ background-color: var(--second); &:hover { - min-width: 0; + //min-width: 350px; } } @@ -693,4 +701,4 @@ .shortcut-key span { font-size: 12px; color: var(--black); -} \ No newline at end of file +} diff --git a/app/components/chat.tsx b/app/components/chat.tsx index b45d36f9587..22a470a8aed 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -117,7 +117,7 @@ import { MultimodalContent } from "../client/api"; const localStorage = safeLocalStorage(); import { ClientApi } from "../client/api"; -import { createTTSPlayer } from "../utils/audio"; +import { createTTSPlayer, arrayBufferToWav } from "../utils/audio"; import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts"; const ttsPlayer = createTTSPlayer(); @@ -1121,6 +1121,14 @@ function _Chat() { ); }; + const updateMessageAudio = (msgId?: string, audio_url?: string) => { + chatStore.updateCurrentSession((session) => { + session.messages = session.messages.map((m) => + m.id === msgId ? 
{ ...m, audio_url } : m, + ); + }); + }; + const onDelete = (msgId: string) => { deleteMessage(msgId); }; @@ -1197,7 +1205,7 @@ function _Chat() { const accessStore = useAccessStore(); const [speechStatus, setSpeechStatus] = useState(false); const [speechLoading, setSpeechLoading] = useState(false); - async function openaiSpeech(text: string) { + async function openaiSpeech(text: string): Promise<string | undefined> { if (speechStatus) { ttsPlayer.stop(); setSpeechStatus(false); @@ -1227,16 +1235,22 @@ function _Chat() { }); } setSpeechStatus(true); - ttsPlayer - .play(audioBuffer, () => { - setSpeechStatus(false); - }) - .catch((e) => { - console.error("[OpenAI Speech]", e); - showToast(prettyObject(e)); + try { + const waveFile = arrayBufferToWav(audioBuffer); + const audioFile = new Blob([waveFile], { type: "audio/wav" }); + + const audioUrl: string = await uploadImageRemote(audioFile); + await ttsPlayer.play(audioBuffer, () => { setSpeechStatus(false); - }) - .finally(() => setSpeechLoading(false)); + }); + return audioUrl; + } catch (e) { + console.error("[Speech Error]", e); + showToast(prettyObject(e)); + setSpeechStatus(false); + } finally { + setSpeechLoading(false); + } } } @@ -1793,9 +1807,12 @@ function _Chat() { ) } - onClick={() => - openaiSpeech(getMessageTextContent(message)) - } + onClick={async () => { + const url = await openaiSpeech( + getMessageTextContent(message), + ); + updateMessageAudio(message.id, url); + }} /> )} @@ -1830,7 +1847,11 @@ function _Chat() { ))} )} -
+
)} + {message.audio_url && ( + + )}
diff --git a/app/icons/play.svg b/app/icons/play.svg new file mode 100644 index 00000000000..64e3efca4bb --- /dev/null +++ b/app/icons/play.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/icons/stop.svg b/app/icons/stop.svg new file mode 100644 index 00000000000..9037b3390d4 --- /dev/null +++ b/app/icons/stop.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/styles/globals.scss b/app/styles/globals.scss index e8c10de3f2d..3b8ded1f785 100644 --- a/app/styles/globals.scss +++ b/app/styles/globals.scss @@ -399,3 +399,13 @@ pre { .copyable { user-select: text; } + +audio { + height: 35px; +} +audio::-webkit-media-controls-play-button, +audio::-webkit-media-controls-panel, +audio::-moz-media-controls-play-button, +audio::-moz-media-controls-panel { + background: none; +} diff --git a/app/utils/audio.ts b/app/utils/audio.ts index f6828c7aac4..09453d156b3 100644 --- a/app/utils/audio.ts +++ b/app/utils/audio.ts @@ -43,3 +43,57 @@ export function createTTSPlayer(): TTSPlayer { return { init, play, stop }; } + +export function arrayBufferToWav(buffer: ArrayBuffer): ArrayBuffer { + const numOfChannels = 1; // Mono + const sampleRate = 24000; // 24kHz + const bitsPerSample = 16; + + const bytesPerSample = bitsPerSample / 8; + const blockAlign = numOfChannels * bytesPerSample; + const byteRate = sampleRate * blockAlign; + + // WAV header size is 44 bytes + const wavHeaderSize = 44; + const dataSize = buffer.byteLength; + const totalSize = wavHeaderSize + dataSize; + + const wavBuffer = new ArrayBuffer(totalSize); + const view = new DataView(wavBuffer); + + // RIFF chunk descriptor + writeString(view, 0, "RIFF"); + view.setUint32(4, totalSize - 8, true); // File size minus RIFF header + writeString(view, 8, "WAVE"); + + // FMT sub-chunk + writeString(view, 12, "fmt "); + view.setUint32(16, 16, true); // Sub-chunk size (16 for PCM) + view.setUint16(20, 1, true); // Audio format (1 for PCM) + view.setUint16(22, numOfChannels, true); // Number of 
channels + view.setUint32(24, sampleRate, true); // Sample rate + view.setUint32(28, byteRate, true); // Byte rate + view.setUint16(32, blockAlign, true); // Block align + view.setUint16(34, bitsPerSample, true); // Bits per sample + + // Data sub-chunk + writeString(view, 36, "data"); + view.setUint32(40, dataSize, true); // Data size + + // Write the PCM samples + const audioData = new Uint8Array(buffer); + const wavData = new Uint8Array(wavBuffer); + wavData.set(audioData, wavHeaderSize); + + return wavBuffer; +} + +// Helper function to write a string to the DataView +function writeString(view: DataView, offset: number, string: string) { + if (offset + string.length > view.byteLength) { + throw new Error("String is too long for the available space in DataView"); + } + for (let i = 0; i < string.length; i++) { + view.setUint8(offset + i, string.charCodeAt(i)); + } +} diff --git a/package.json b/package.json index e433445342e..b08601256e8 100644 --- a/package.json +++ b/package.json @@ -33,8 +33,8 @@ "html-to-image": "^1.11.11", "idb-keyval": "^6.2.1", "lodash-es": "^4.17.21", - "mermaid": "^10.6.1", "markdown-to-txt": "^2.0.1", + "mermaid": "^10.6.1", "nanoid": "^5.0.3", "next": "^14.1.1", "node-fetch": "^3.3.1",