diff --git a/lib/bumblebee/audio/speech_to_text_whisper.ex b/lib/bumblebee/audio/speech_to_text_whisper.ex
index e52e8ce2..2008e7bb 100644
--- a/lib/bumblebee/audio/speech_to_text_whisper.ex
+++ b/lib/bumblebee/audio/speech_to_text_whisper.ex
@@ -136,7 +136,7 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do
           {:ok, [Nx.backend_transfer(input, Nx.BinaryBackend)]}
 
         {:file, path} when is_binary(path) ->
-          ffmpeg_read_as_pcm(path, sampling_rate)
+          from_file(path, sampling_rate)
 
         other ->
           cond do
@@ -164,49 +164,104 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do
     end
   end
 
-  defp ffmpeg_read_as_pcm(path, sampling_rate) do
-    channels = 1
+  if Code.ensure_loaded?(Xav) do
+    # When the optional :xav dependency is loaded, the audio is read
+    # through Xav.Reader rather than by spawning the ffmpeg executable.
+    defp from_file(path, sampling_rate) do
+      # This chunk can be of arbitrary size, the serving accumulates
+      # and overlaps chunks internally as needed.
 
-    format =
-      case System.endianness() do
-        :little -> "f32le"
-        :big -> "f32be"
-      end
+      chunk_size = 30
+      chunk_samples = sampling_rate * chunk_size
 
-    cond do
-      System.find_executable("ffmpeg") == nil ->
-        {:error, "ffmpeg not found in PATH"}
+      if File.exists?(path) do
+        stream =
+          path
+          |> Xav.Reader.stream!(
+            read: :audio,
+            out_format: :f32,
+            out_channels: 1,
+            out_sample_rate: sampling_rate
+          )
+          |> Stream.transform(
+            fn -> {<<>>, 0} end,
+            fn frame, {buffer, samples} ->
+              buffer = buffer <> frame.data
+              samples = samples + frame.samples
+
+              if samples >= chunk_samples do
+                chunk = Nx.from_binary(buffer, :f32, backend: Nx.BinaryBackend)
+                {[chunk], {<<>>, 0}}
+              else
+                {[], {buffer, samples}}
+              end
+            end,
+            fn
+              # Nothing is left over when the audio ends exactly on a chunk
+              # boundary (or has no frames); Nx.from_binary/3 rejects <<>>
+              {<<>>, _samples} = acc ->
+                {[], acc}
+
+              {buffer, _samples} ->
+                chunk = Nx.from_binary(buffer, :f32, backend: Nx.BinaryBackend)
+                {[chunk], {<<>>, 0}}
+            end,
+            fn _ -> :ok end
+          )
 
-      not File.exists?(path) ->
+        {:ok, stream}
+      else
         {:error, "no file found at #{path}"}
+      end
+    end
+  else
+    defp from_file(path, sampling_rate) do
+      channels = 1
+
+      format =
+        case System.endianness() do
+          :little -> "f32le"
+          :big -> "f32be"
+        end
 
-      true ->
-        # This chunk can be of arbitrary size, the serving accumulates
-        # and overlaps chunks internally as needed. We read the file
-        # as stream to reduce memory usage
-        chunk_size = 30
+      cond do
+        System.find_executable("ffmpeg") == nil ->
+          {:error, "ffmpeg not found in PATH"}
 
-        stream =
-          Stream.iterate(0, fn offset -> offset + chunk_size end)
-          |> Stream.transform({}, fn offset, acc ->
-            System.cmd(
-              "ffmpeg",
-              ~w[-ss #{offset} -t #{chunk_size} -i #{path} -ac #{channels} -ar #{sampling_rate} -f #{format} -hide_banner -loglevel quiet pipe:1]
-            )
-            |> case do
-              {<<>>, 0} ->
-                {:halt, acc}
-
-              {data, 0} ->
-                chunk = Nx.from_binary(data, :f32, backend: Nx.BinaryBackend)
-                {[chunk], acc}
-
-              {_, 1} ->
-                raise "ffmpeg failed to decode the given file"
-            end
-          end)
+        not File.exists?(path) ->
+          {:error, "no file found at #{path}"}
 
-        {:ok, stream}
+        true ->
+          # This chunk can be of arbitrary size, the serving accumulates
+          # and overlaps chunks internally as needed. We read the file
+          # as stream to reduce memory usage
+          chunk_size = 30
+
+          stream =
+            Stream.iterate(0, fn offset -> offset + chunk_size end)
+            |> Stream.transform({}, fn offset, acc ->
+              # The arguments are given as an explicit list, so that a
+              # path containing whitespace is passed as a single argument
+              args =
+                ["-ss", "#{offset}", "-t", "#{chunk_size}", "-i", path] ++
+                  ["-ac", "#{channels}", "-ar", "#{sampling_rate}", "-f", format] ++
+                  ~w[-hide_banner -loglevel quiet pipe:1]
+
+              case System.cmd("ffmpeg", args) do
+                {<<>>, 0} ->
+                  {:halt, acc}
+
+                {data, 0} ->
+                  chunk = Nx.from_binary(data, :f32, backend: Nx.BinaryBackend)
+                  {[chunk], acc}
+
+                {_, _status} ->
+                  raise "ffmpeg failed to decode the given file"
+              end
+            end)
+
+          {:ok, stream}
+      end
     end
   end
 
diff --git a/mix.exs b/mix.exs
index bbe4aa3b..565abedb 100644
--- a/mix.exs
+++ b/mix.exs
@@ -34,7 +34,7 @@ defmodule Bumblebee.MixProject do
       {:axon, "~> 0.7.0"},
       # {:axon, github: "elixir-nx/axon", override: true},
       {:tokenizers, "~> 0.4"},
-      {:nx, "~> 0.9.0"},
+      {:nx, "~> 0.9.0", override: true},
       {:exla, ">= 0.0.0", only: [:dev, :test]},
       {:torchx, ">= 0.0.0", only: [:dev, :test]},
       # {:nx, github: "elixir-nx/nx", sparse: "nx", override: true},
@@ -49,7 +49,8 @@ defmodule Bumblebee.MixProject do
       {:stb_image, "~> 0.6.0", only: :test},
       {:bypass, "~> 2.1", only: :test},
       {:ex_doc, "~> 0.28", only: :dev, runtime: false},
-      {:nx_signal, "~> 0.2.0"}
+      {:nx_signal, "~> 0.2.0"},
+      {:xav, "~> 0.6.0", optional: true}
     ]
   end
 
diff --git a/mix.lock b/mix.lock
index ef31365a..4f4b0fb0 100644
--- a/mix.lock
+++ b/mix.lock
@@ -36,5 +36,6 @@
   "torchx": {:hex, :torchx, "0.9.0", "936cbd32233f89d73700c39b7ef56f94b3f3541db03c90f8ddf6b3fe73260e28", [:mix], [{:nx, "~> 0.9.0", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "4e057d6b93fc91191957230b2c61c408861b888abdf6a900baf0db4125405505"},
   "unpickler": {:hex, :unpickler, "0.1.0", "c2262c0819e6985b761e7107546cef96a485f401816be5304a65fdd200d5bd6a", [:mix], [], "hexpm", "e2b3f61e62406187ac52afead8a63bfb4e49394028993f3c4c42712743cab79e"},
   "unzip": {:hex, :unzip, "0.12.0", "beed92238724732418b41eba77dcb7f51e235b707406c05b1732a3052d1c0f36", [:mix], [], "hexpm", "95655b72db368e5a84951f0bed586ac053b55ee3815fd96062fce10ce4fc998d"},
+  "xav": {:hex, :xav, "0.6.0", "38835d735fc3d620e41c84fe29cd7db0381436b54c9ef209ba9112255a091fc4", [:make, :mix], [{:elixir_make, "~> 0.7", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:nx, "~> 0.7.0", [hex: :nx, repo: "hexpm", optional: true]}], "hexpm", "ad988df9d44c0ca3ccc4053ea0f1f1702ca14d4b926047b224deb527f0866edf"},
   "xla": {:hex, :xla, "0.8.0", "fef314d085dd3ee16a0816c095239938f80769150e15db16dfaa435553d7cb16", [:make, :mix], [{:elixir_make, "~> 0.4", [hex: :elixir_make, repo: "hexpm", optional: false]}], "hexpm", "739c61c8d93b97e12ba0369d10e76130224c208f1a76ad293e3581f056833e57"},
 }