From 3eac8f44cb954bd73660e37cc78350d88ea8de3a Mon Sep 17 00:00:00 2001
From: Drew Bednar
Date: Tue, 23 May 2023 10:26:42 -0400
Subject: [PATCH] Working initial implementation

---
 .gitignore                  |   1 -
 .pre-commit-config.yaml     |  29 ++++++++++
 README.md                   |   2 +-
 local_whisper/webservice.py | 107 ++++++++++++++++++++++++++++++++++++
 pyinvoke/autocomplete.sh    |   1 -
 pyproject.toml              |   4 ++
 requirements.in             |   4 +-
 requirements.txt            |   8 +++
 8 files changed, 152 insertions(+), 4 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 local_whisper/webservice.py

diff --git a/.gitignore b/.gitignore
index 5d381cc..489bdd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,4 +159,3 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..f63adbf
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,29 @@
+default_stages: [commit, push]
+repos:
+  - repo: https://github.com/shellcheck-py/shellcheck-py
+    rev: v0.9.0.2
+    hooks:
+      - id: shellcheck
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+  - repo: https://github.com/psf/black
+    rev: 23.3.0
+    hooks:
+      - id: black
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    # Ruff version.
+    rev: "v0.0.263"
+    hooks:
+      - id: ruff
+        # Enable auto fix
+        # args: [--fix, --exit-non-zero-on-fix]
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: isort (python)
diff --git a/README.md b/README.md
index 8cc42b2..0e63c16 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,3 @@
 # LocalWhisper
 
-Making OpenAPI's Open Whisper available via ReST API locally.
\ No newline at end of file
+Making OpenAI's Whisper available via a REST API locally.
diff --git a/local_whisper/webservice.py b/local_whisper/webservice.py
new file mode 100644
index 0000000..392a4ea
--- /dev/null
+++ b/local_whisper/webservice.py
@@ -0,0 +1,107 @@
+import os
+from threading import Lock
+from typing import BinaryIO
+
+import ffmpeg
+import numpy as np
+import torch
+import whisper
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import RedirectResponse
+
+from . import __version__
+
+# TODO use pydantic config
+model_name = os.getenv("ASR_MODEL", "base")
+if torch.cuda.is_available():
+    model = whisper.load_model(model_name).cuda()
+else:
+    model = whisper.load_model(model_name)
+model_lock = Lock()
+
+
+# TODO use pydantic config
+SAMPLE_RATE = 16000
+
+
+# TODO move transcribe to a modeling worker
+def transcribe(
+    audio,
+    # task: Union[str, None],
+    # language: Union[str, None],
+    # initial_prompt: Union[str, None],
+):
+    # options_dict = {"task" : task}
+    # if language:
+    #     options_dict["language"] = language
+    # if initial_prompt:
+    #     options_dict["initial_prompt"] = initial_prompt
+    with model_lock:
+        # result = model.transcribe(audio, **options_dict)
+        result = model.transcribe(audio)
+
+    return result
+
+
+# TODO probably can offload this on a worker queue too
+def load_audio(file: BinaryIO, encode=True, sr: int = SAMPLE_RATE):
+    """
+    Open an audio file object and read as mono waveform, resampling as necessary.
+    Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py
+    to accept a file object
+
+    Parameters
+    ----------
+    file: BinaryIO
+        The audio file-like object
+    encode: Boolean
+        If true, encode audio stream to WAV before sending to whisper
+    sr: int
+        The sample rate to resample the audio if necessary
+    Returns
+    -------
+    A NumPy array containing the audio waveform, in float32 dtype.
+    """
+    if encode:
+        try:
+            # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+            # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+            out, _ = (
+                ffmpeg.input("pipe:", threads=0)
+                .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+                .run(
+                    cmd="ffmpeg",
+                    capture_stdout=True,
+                    capture_stderr=True,
+                    input=file.read(),
+                )
+            )
+        except ffmpeg.Error as e:
+            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+    else:
+        out = file.read()
+
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+
+app = FastAPI(
+    title="Local Whisper",
+    description="Making OpenAI's Whisper available via a REST API locally.",
+    version=__version__,
+    swagger_ui_parameters={"defaultModelsExpandDepth": -1},
+    license_info={
+        "name": "MIT License",
+    },
+)
+
+
+@app.get("/", response_class=RedirectResponse, include_in_schema=False)
+async def index():
+    return "/docs"
+
+
+@app.post("/audio/transcriptions")
+async def asr(file: UploadFile = File(...)):
+    if file.content_type.startswith("audio/"):
+        transcription = transcribe(load_audio(file.file))
+        return {"text": transcription["text"]}
diff --git a/pyinvoke/autocomplete.sh b/pyinvoke/autocomplete.sh
index 6279afd..d37d964 100644
--- a/pyinvoke/autocomplete.sh
+++ b/pyinvoke/autocomplete.sh
@@ -30,4 +30,3 @@ _complete_invoke() {
 complete -F _complete_invoke -o default invoke inv
 
 # vim: set ft=sh :
-
diff --git a/pyproject.toml b/pyproject.toml
index 97ee943..cd7b81a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,10 @@ savant-cli = "local_whisper.cli:main"
 
 [project.optional-dependencies]
 whisper = ["openai-whisper"]
 
+[tool.ruff]
+# Allow lines to be as long as 120 characters.
+line-length = 120
+
 [tool.setuptools]
 packages = ["local_whisper"]
diff --git a/requirements.in b/requirements.in
index 5855832..22f5b75 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1,2 +1,4 @@
 fastapi
-openai-whisper
\ No newline at end of file
+openai-whisper
+uvicorn
+python-multipart
diff --git a/requirements.txt b/requirements.txt
index 46d14c4..eb6c4b9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,8 @@ certifi==2023.5.7
     # via requests
 charset-normalizer==3.1.0
     # via requests
+click==8.1.3
+    # via uvicorn
 cmake==3.26.3
     # via triton
 fastapi==0.95.2
@@ -22,6 +24,8 @@ filelock==3.12.0
     #   triton
 future==0.18.3
     # via ffmpeg-python
+h11==0.14.0
+    # via uvicorn
 idna==3.4
     # via
     #   anyio
@@ -75,6 +79,8 @@ openai-whisper==20230314
     # via -r requirements.in
 pydantic==1.10.7
     # via fastapi
+python-multipart==0.0.6
+    # via -r requirements.in
 regex==2023.5.5
     # via tiktoken
 requests==2.31.0
@@ -103,6 +109,8 @@ typing-extensions==4.5.0
     #   torch
 urllib3==2.0.2
     # via requests
+uvicorn==0.22.0
+    # via -r requirements.in
 wheel==0.40.0
     # via
     #   nvidia-cublas-cu11
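
Editorial notes (sketches, not part of the committed diff):

A note on load_audio: with encode=True the ffmpeg subprocess emits raw 16 kHz mono s16le PCM, and with encode=False the caller must already supply bytes in that format, because the final line only reinterprets and rescales the buffer. A minimal illustration of that scaling, using the same expression as the patch:

    import numpy as np

    # Three 16-bit samples: silence, half scale, most negative value.
    pcm_bytes = np.array([0, 16384, -32768], dtype=np.int16).tobytes()

    # Same expression as the end of load_audio: reinterpret as int16, rescale to float32 in [-1.0, 1.0).
    waveform = np.frombuffer(pcm_bytes, np.int16).flatten().astype(np.float32) / 32768.0
    print(waveform)  # [ 0.   0.5 -1. ]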
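
One behavior worth flagging in the new /audio/transcriptions handler: when the uploaded part does not declare an audio/* content type, the function falls through and FastAPI returns a 200 response with a null body. A possible follow-up, sketched here rather than included in the patch (the 415 status and error message are editorial choices), would be to reject such uploads explicitly:

    from fastapi import File, HTTPException, UploadFile

    @app.post("/audio/transcriptions")
    async def asr(file: UploadFile = File(...)):
        # Reject anything not declared as audio/* instead of silently returning null.
        if not file.content_type or not file.content_type.startswith("audio/"):
            raise HTTPException(status_code=415, detail="Expected an audio/* upload")
        transcription = transcribe(load_audio(file.file))
        return {"text": transcription["text"]}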
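
To exercise the service end to end: the patch pins uvicorn and python-multipart, so the app can be served with `uvicorn local_whisper.webservice:app` (module path and uvicorn's default port 8000 assumed here). A client-side sketch using the requests package already pinned in requirements.txt; sample.wav is a placeholder file name:

    import requests

    URL = "http://localhost:8000/audio/transcriptions"  # assumes uvicorn's default host/port

    with open("sample.wav", "rb") as audio_file:
        # The handler checks UploadFile.content_type, so the multipart part must declare audio/*.
        files = {"file": ("sample.wav", audio_file, "audio/wav")}
        response = requests.post(URL, files=files)

    response.raise_for_status()
    print(response.json()["text"])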