From 1b6e128853a3d4c8fe5654ed3374296d2520d144 Mon Sep 17 00:00:00 2001 From: Kevin Schweikert Date: Tue, 22 Oct 2024 23:57:05 +0200 Subject: [PATCH 1/5] use xav instead of ffmpeg --- lib/bumblebee/audio/speech_to_text_whisper.ex | 64 ++++++------------- mix.exs | 5 +- mix.lock | 1 + 3 files changed, 25 insertions(+), 45 deletions(-) diff --git a/lib/bumblebee/audio/speech_to_text_whisper.ex b/lib/bumblebee/audio/speech_to_text_whisper.ex index e52e8ce2..b6e28fd7 100644 --- a/lib/bumblebee/audio/speech_to_text_whisper.ex +++ b/lib/bumblebee/audio/speech_to_text_whisper.ex @@ -136,7 +136,7 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do {:ok, [Nx.backend_transfer(input, Nx.BinaryBackend)]} {:file, path} when is_binary(path) -> - ffmpeg_read_as_pcm(path, sampling_rate) + from_file(path, sampling_rate) other -> cond do @@ -164,49 +164,27 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do end end - defp ffmpeg_read_as_pcm(path, sampling_rate) do - channels = 1 + defp from_file(path, sampling_rate) do + # This chunk can be of arbitrary size, the serving accumulates + # and overlaps chunks internally as needed. - format = - case System.endianness() do - :little -> "f32le" - :big -> "f32be" - end - - cond do - System.find_executable("ffmpeg") == nil -> - {:error, "ffmpeg not found in PATH"} - - not File.exists?(path) -> - {:error, "no file found at #{path}"} - - true -> - # This chunk can be of arbitrary size, the serving accumulates - # and overlaps chunks internally as needed. 
We read the file - # as stream to reduce memory usage - chunk_size = 30 - - stream = - Stream.iterate(0, fn offset -> offset + chunk_size end) - |> Stream.transform({}, fn offset, acc -> - System.cmd( - "ffmpeg", - ~w[-ss #{offset} -t #{chunk_size} -i #{path} -ac #{channels} -ar #{sampling_rate} -f #{format} -hide_banner -loglevel quiet pipe:1] - ) - |> case do - {<<>>, 0} -> - {:halt, acc} - - {data, 0} -> - chunk = Nx.from_binary(data, :f32, backend: Nx.BinaryBackend) - {[chunk], acc} - - {_, 1} -> - raise "ffmpeg failed to decode the given file" - end - end) - - {:ok, stream} + if File.exists?(path) do + stream = + path + |> Xav.Reader.stream!( + read: :audio, + out_format: :f32, + out_channels: 1, + out_sample_rate: sampling_rate + ) + |> Stream.map(fn frame -> Xav.Frame.to_nx(frame) end) + |> Stream.chunk_every(1000) + |> Stream.map(&Nx.Batch.concatenate/1) + |> Stream.map(fn batch -> Nx.Defn.jit_apply(&Function.identity/1, [batch]) end) + + {:ok, stream} + else + {:error, "no file found at #{path}"} end end diff --git a/mix.exs b/mix.exs index bbe4aa3b..6f548248 100644 --- a/mix.exs +++ b/mix.exs @@ -34,7 +34,7 @@ defmodule Bumblebee.MixProject do {:axon, "~> 0.7.0"}, # {:axon, github: "elixir-nx/axon", override: true}, {:tokenizers, "~> 0.4"}, - {:nx, "~> 0.9.0"}, + {:nx, "~> 0.9.0", override: true}, {:exla, ">= 0.0.0", only: [:dev, :test]}, {:torchx, ">= 0.0.0", only: [:dev, :test]}, # {:nx, github: "elixir-nx/nx", sparse: "nx", override: true}, @@ -49,7 +49,8 @@ defmodule Bumblebee.MixProject do {:stb_image, "~> 0.6.0", only: :test}, {:bypass, "~> 2.1", only: :test}, {:ex_doc, "~> 0.28", only: :dev, runtime: false}, - {:nx_signal, "~> 0.2.0"} + {:nx_signal, "~> 0.2.0"}, + {:xav, "~> 0.6.0"} ] end diff --git a/mix.lock b/mix.lock index ef31365a..4f4b0fb0 100644 --- a/mix.lock +++ b/mix.lock @@ -36,5 +36,6 @@ "torchx": {:hex, :torchx, "0.9.0", "936cbd32233f89d73700c39b7ef56f94b3f3541db03c90f8ddf6b3fe73260e28", [:mix], [{:nx, "~> 0.9.0", [hex: :nx, repo: 
"hexpm", optional: false]}], "hexpm", "4e057d6b93fc91191957230b2c61c408861b888abdf6a900baf0db4125405505"}, "unpickler": {:hex, :unpickler, "0.1.0", "c2262c0819e6985b761e7107546cef96a485f401816be5304a65fdd200d5bd6a", [:mix], [], "hexpm", "e2b3f61e62406187ac52afead8a63bfb4e49394028993f3c4c42712743cab79e"}, "unzip": {:hex, :unzip, "0.12.0", "beed92238724732418b41eba77dcb7f51e235b707406c05b1732a3052d1c0f36", [:mix], [], "hexpm", "95655b72db368e5a84951f0bed586ac053b55ee3815fd96062fce10ce4fc998d"}, + "xav": {:hex, :xav, "0.6.0", "38835d735fc3d620e41c84fe29cd7db0381436b54c9ef209ba9112255a091fc4", [:make, :mix], [{:elixir_make, "~> 0.7", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:nx, "~> 0.7.0", [hex: :nx, repo: "hexpm", optional: true]}], "hexpm", "ad988df9d44c0ca3ccc4053ea0f1f1702ca14d4b926047b224deb527f0866edf"}, "xla": {:hex, :xla, "0.8.0", "fef314d085dd3ee16a0816c095239938f80769150e15db16dfaa435553d7cb16", [:make, :mix], [{:elixir_make, "~> 0.4", [hex: :elixir_make, repo: "hexpm", optional: false]}], "hexpm", "739c61c8d93b97e12ba0369d10e76130224c208f1a76ad293e3581f056833e57"}, } From 7b1c02f5b2acdef1ecc29ffe892b176f34953a9a Mon Sep 17 00:00:00 2001 From: Kevin Schweikert <54439512+kevinschweikert@users.noreply.github.com> Date: Wed, 23 Oct 2024 12:59:51 +0200 Subject: [PATCH 2/5] use default backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jonatan Kłosko --- lib/bumblebee/audio/speech_to_text_whisper.ex | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/bumblebee/audio/speech_to_text_whisper.ex b/lib/bumblebee/audio/speech_to_text_whisper.ex index b6e28fd7..df196e7b 100644 --- a/lib/bumblebee/audio/speech_to_text_whisper.ex +++ b/lib/bumblebee/audio/speech_to_text_whisper.ex @@ -177,7 +177,9 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do out_channels: 1, out_sample_rate: sampling_rate ) - |> Stream.map(fn frame -> Xav.Frame.to_nx(frame) end) + |> Stream.map(fn 
frame -> + Nx.with_default_backend(Nx.BinaryBackend, fn -> Xav.Frame.to_nx(frame) end) + end) |> Stream.chunk_every(1000) |> Stream.map(&Nx.Batch.concatenate/1) |> Stream.map(fn batch -> Nx.Defn.jit_apply(&Function.identity/1, [batch]) end) From fd8178677f9151787e4d82cdb5c20d2ca9d2f73d Mon Sep 17 00:00:00 2001 From: Kevin Schweikert Date: Thu, 24 Oct 2024 11:48:54 +0200 Subject: [PATCH 3/5] transform stream to buffer frame binaries --- lib/bumblebee/audio/speech_to_text_whisper.ex | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/lib/bumblebee/audio/speech_to_text_whisper.ex b/lib/bumblebee/audio/speech_to_text_whisper.ex index df196e7b..93cc71a8 100644 --- a/lib/bumblebee/audio/speech_to_text_whisper.ex +++ b/lib/bumblebee/audio/speech_to_text_whisper.ex @@ -168,6 +168,8 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do # This chunk can be of arbitrary size, the serving accumulates # and overlaps chunks internally as needed. + chunk_samples = sampling_rate * 30 + if File.exists?(path) do stream = path @@ -177,12 +179,25 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do out_channels: 1, out_sample_rate: sampling_rate ) - |> Stream.map(fn frame -> - Nx.with_default_backend(Nx.BinaryBackend, fn -> Xav.Frame.to_nx(frame) end) - end) - |> Stream.chunk_every(1000) - |> Stream.map(&Nx.Batch.concatenate/1) - |> Stream.map(fn batch -> Nx.Defn.jit_apply(&Function.identity/1, [batch]) end) + |> Stream.transform( + fn -> {<<>>, 0} end, + fn frame, {buffer, samples} -> + buffer = buffer <> frame.data + samples = samples + frame.samples + + if samples >= chunk_samples do + chunk = Nx.from_binary(buffer, :f32, backend: Nx.BinaryBackend) + {[chunk], {<<>>, 0}} + else + {[], {buffer, samples}} + end + end, + fn {buffer, _samples} -> + chunk = Nx.from_binary(buffer, :f32, backend: Nx.BinaryBackend) + {[chunk], {<<>>, 0}} + end, + fn _ -> :ok end + ) {:ok, stream} else From fc45ff57c42e5747ba4c3ec1fb9443255532e2fd Mon Sep 17 00:00:00 2001 
From: Kevin Schweikert Date: Thu, 24 Oct 2024 11:52:48 +0200 Subject: [PATCH 4/5] bring back ffmpeg --- lib/bumblebee/audio/speech_to_text_whisper.ex | 121 ++++++++++++------ 1 file changed, 85 insertions(+), 36 deletions(-) diff --git a/lib/bumblebee/audio/speech_to_text_whisper.ex b/lib/bumblebee/audio/speech_to_text_whisper.ex index 93cc71a8..2008e7bb 100644 --- a/lib/bumblebee/audio/speech_to_text_whisper.ex +++ b/lib/bumblebee/audio/speech_to_text_whisper.ex @@ -164,44 +164,93 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do end end - defp from_file(path, sampling_rate) do - # This chunk can be of arbitrary size, the serving accumulates - # and overlaps chunks internally as needed. - - chunk_samples = sampling_rate * 30 - - if File.exists?(path) do - stream = - path - |> Xav.Reader.stream!( - read: :audio, - out_format: :f32, - out_channels: 1, - out_sample_rate: sampling_rate - ) - |> Stream.transform( - fn -> {<<>>, 0} end, - fn frame, {buffer, samples} -> - buffer = buffer <> frame.data - samples = samples + frame.samples - - if samples >= chunk_samples do + if Code.ensure_loaded?(Xav) do + defp from_file(path, sampling_rate) do + # This chunk can be of arbitrary size, the serving accumulates + # and overlaps chunks internally as needed. 
+ + chunk_size = 30 + chunk_samples = sampling_rate * chunk_size + + if File.exists?(path) do + stream = + path + |> Xav.Reader.stream!( + read: :audio, + out_format: :f32, + out_channels: 1, + out_sample_rate: sampling_rate + ) + |> Stream.transform( + fn -> {<<>>, 0} end, + fn frame, {buffer, samples} -> + buffer = buffer <> frame.data + samples = samples + frame.samples + + if samples >= chunk_samples do + chunk = Nx.from_binary(buffer, :f32, backend: Nx.BinaryBackend) + {[chunk], {<<>>, 0}} + else + {[], {buffer, samples}} + end + end, + fn {buffer, _samples} -> chunk = Nx.from_binary(buffer, :f32, backend: Nx.BinaryBackend) {[chunk], {<<>>, 0}} - else - {[], {buffer, samples}} - end - end, - fn {buffer, _samples} -> - chunk = Nx.from_binary(buffer, :f32, backend: Nx.BinaryBackend) - {[chunk], {<<>>, 0}} - end, - fn _ -> :ok end - ) - - {:ok, stream} - else - {:error, "no file found at #{path}"} + end, + fn _ -> :ok end + ) + + {:ok, stream} + else + {:error, "no file found at #{path}"} + end + end + else + defp from_file(path, sampling_rate) do + channels = 1 + + format = + case System.endianness() do + :little -> "f32le" + :big -> "f32be" + end + + cond do + System.find_executable("ffmpeg") == nil -> + {:error, "ffmpeg not found in PATH"} + + not File.exists?(path) -> + {:error, "no file found at #{path}"} + + true -> + # This chunk can be of arbitrary size, the serving accumulates + # and overlaps chunks internally as needed. 
We read the file + # as stream to reduce memory usage + chunk_size = 30 + + stream = + Stream.iterate(0, fn offset -> offset + chunk_size end) + |> Stream.transform({}, fn offset, acc -> + System.cmd( + "ffmpeg", + ~w[-ss #{offset} -t #{chunk_size} -i #{path} -ac #{channels} -ar #{sampling_rate} -f #{format} -hide_banner -loglevel quiet pipe:1] + ) + |> case do + {<<>>, 0} -> + {:halt, acc} + + {data, 0} -> + chunk = Nx.from_binary(data, :f32, backend: Nx.BinaryBackend) + {[chunk], acc} + + {_, 1} -> + raise "ffmpeg failed to decode the given file" + end + end) + + {:ok, stream} + end end end From d882343f72bca17fb077037549d74a0ab40e82d7 Mon Sep 17 00:00:00 2001 From: Kevin Schweikert Date: Thu, 24 Oct 2024 11:55:36 +0200 Subject: [PATCH 5/5] make xav optional --- mix.exs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mix.exs b/mix.exs index 6f548248..565abedb 100644 --- a/mix.exs +++ b/mix.exs @@ -50,7 +50,7 @@ defmodule Bumblebee.MixProject do {:bypass, "~> 2.1", only: :test}, {:ex_doc, "~> 0.28", only: :dev, runtime: false}, {:nx_signal, "~> 0.2.0"}, - {:xav, "~> 0.6.0"} + {:xav, "~> 0.6.0", optional: true} ] end