Working initial implementation

3 years ago · 3eac8f44cb
parent 4fddd87271
commit 3eac8f44cb
8 changed files with 152 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -159,4 +159,3 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,29 @@
+default_stages: [commit, push]
+repos:
+  - repo: https://github.com/shellcheck-py/shellcheck-py
+    rev: v0.9.0.2
+    hooks:
+      - id: shellcheck
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+  - repo: https://github.com/psf/black
+    rev: 23.3.0
+    hooks:
+      - id: black
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    # Ruff version.
+    rev: "v0.0.263"
+    hooks:
+      - id: ruff
+      # Enable auto fix
+      # args: [--fix, --exit-non-zero-on-fix]
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: isort (python)
--- a/local_whisper/webservice.py
+++ b/local_whisper/webservice.py
@ -0,0 +1,107 @@
+import os
+from threading import Lock
+from typing import BinaryIO
+
+import ffmpeg
+import numpy as np
+import torch
+import whisper
+from fastapi import FastAPI, File, HTTPException, UploadFile
+from fastapi.responses import RedirectResponse
+
+from . import __version__
+
+# TODO use pydantic config
+# Whisper checkpoint to serve; override with the ASR_MODEL environment variable.
+model_name = os.getenv("ASR_MODEL", "base")
+# Loaded once at import time; the same instance is used for every transcription.
+model = whisper.load_model(model_name)
+if torch.cuda.is_available():
+    # Make sure the weights live on the GPU whenever one is present.
+    model = model.cuda()
+# Held around calls into the shared model so they never overlap.
+model_lock = Lock()
+
+
+# TODO use pydantic config
+# Default sample rate that incoming audio is resampled to.
+SAMPLE_RATE = 16000
+
+
+# TODO move transcribe to a modeling worker
+# TODO accept optional `task`, `language` and `initial_prompt` arguments and
+#      forward the ones that are set as keyword options to model.transcribe
+def transcribe(audio):
+    """
+    Run the shared Whisper model on an audio waveform.
+
+    Parameters
+    ----------
+    audio
+        The audio to transcribe, passed straight to ``model.transcribe``
+        (the web endpoint supplies the array returned by ``load_audio``).
+
+    Returns
+    -------
+    Whatever ``model.transcribe`` returns; callers read its ``"text"`` entry.
+    """
+    # The model is a module-level singleton, so serialize access to it.
+    with model_lock:
+        return model.transcribe(audio)
+
+
+# TODO probably can offload this on a worker queue too
+def load_audio(file: BinaryIO, encode=True, sr: int = SAMPLE_RATE):
+    """
+    Read an audio file object into a mono float32 waveform.
+    Adapted from https://github.com/openai/whisper/blob/main/whisper/audio.py
+    so that it takes a file object instead of a path.
+
+    Parameters
+    ----------
+    file: BinaryIO
+        Binary file-like object holding the audio; it is read to the end.
+    encode: Boolean
+        If true, pipe the bytes through the ffmpeg CLI to obtain mono signed
+        16-bit PCM at ``sr`` Hz. If false, the bytes are used as-is and are
+        assumed to already be raw 16-bit PCM samples.
+    sr: int
+        Target sample rate handed to ffmpeg (only used when ``encode`` is true)
+
+    Returns
+    -------
+    A NumPy array containing the audio waveform, in float32 dtype, obtained by
+    dividing the int16 samples by 32768.
+
+    Raises
+    ------
+    RuntimeError
+        If ffmpeg reports an error; the message carries ffmpeg's stderr output.
+    """
+    raw = file.read()
+
+    if encode:
+        # Describe the conversion first: stdin in, down-mixed and resampled
+        # raw PCM on stdout. Requires the ffmpeg CLI and `ffmpeg-python` package.
+        pipeline = ffmpeg.input("pipe:", threads=0).output(
+            "-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr
+        )
+        try:
+            # Runs ffmpeg as a subprocess, feeding it the uploaded bytes.
+            raw, _ = pipeline.run(
+                cmd="ffmpeg",
+                capture_stdout=True,
+                capture_stderr=True,
+                input=raw,
+            )
+        except ffmpeg.Error as e:
+            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+    samples = np.frombuffer(raw, np.int16).flatten()
+    return samples.astype(np.float32) / 32768.0
+
+
+# Application object that the route decorators in this module register against.
+app = FastAPI(
+    title="Local Whisper",
+    description="Making OpenAI's Open Whisper available via ReST API locally.",
+    version=__version__,
+    # A depth of -1 hides the models/schemas section in Swagger UI.
+    swagger_ui_parameters={"defaultModelsExpandDepth": -1},
+    license_info={
+        "name": "MIT License",
+    },
+)
+
+
+@app.get("/", response_class=RedirectResponse, include_in_schema=False)
+async def index():
+    """Redirect the root URL to the interactive API documentation."""
+    # The returned string becomes the target of the RedirectResponse.
+    docs_url = "/docs"
+    return docs_url
+
+
+@app.post("/audio/transcriptions")
+async def asr(file: UploadFile = File(...)):
+    """
+    Transcribe an uploaded audio file.
+
+    Parameters
+    ----------
+    file: UploadFile
+        Multipart upload whose content type must start with ``audio/``.
+
+    Returns
+    -------
+    A dict ``{"text": <transcription>}``.
+
+    Raises
+    ------
+    HTTPException
+        415 when the upload has no content type or it is not ``audio/*``.
+    """
+    # content_type is None when the client sends no Content-Type for the part.
+    content_type = file.content_type or ""
+    if not content_type.startswith("audio/"):
+        # Reject explicitly instead of falling through and answering `null`.
+        raise HTTPException(
+            status_code=415,
+            detail=f"Unsupported media type {content_type!r}; expected audio/*",
+        )
+
+    transcription = transcribe(load_audio(file.file))
+    return {"text": transcription["text"]}
--- a/pyinvoke/autocomplete.sh
+++ b/pyinvoke/autocomplete.sh
@ -30,4 +30,3 @@ _complete_invoke() {
 complete -F _complete_invoke -o default invoke inv

 # vim: set ft=sh :
-
--- a/pyproject.toml
+++ b/pyproject.toml
@ -23,6 +23,10 @@ savant-cli = "local_whisper.cli:main"
 [project.optional-dependencies]
 whisper = ["openai-whisper"]

+[tool.ruff]
+# Allow lines to be as long as 120 characters.
+line-length = 120
+
 [tool.setuptools]
 packages = ["local_whisper"]

--- a/requirements.in
+++ b/requirements.in
@ -1,2 +1,4 @@
 fastapi
 openai-whisper
+uvicorn
+python-multipart
--- a/requirements.txt
+++ b/requirements.txt
@ -10,6 +10,8 @@ certifi==2023.5.7
    # via requests
 charset-normalizer==3.1.0
    # via requests
+click==8.1.3
+    # via uvicorn
 cmake==3.26.3
    # via triton
 fastapi==0.95.2
@ -22,6 +24,8 @@ filelock==3.12.0
    #   triton
 future==0.18.3
    # via ffmpeg-python
+h11==0.14.0
+    # via uvicorn
 idna==3.4
    # via
    #   anyio
@ -75,6 +79,8 @@ openai-whisper==20230314
    # via -r requirements.in
 pydantic==1.10.7
    # via fastapi
+python-multipart==0.0.6
+    # via -r requirements.in
 regex==2023.5.5
    # via tiktoken
 requests==2.31.0
@ -103,6 +109,8 @@ typing-extensions==4.5.0
    #   torch
 urllib3==2.0.2
    # via requests
+uvicorn==0.22.0
+    # via -r requirements.in
 wheel==0.40.0
    # via
    #   nvidia-cublas-cu11