Working initial implementation
parent 4fddd87271
commit 3eac8f44cb
@@ -0,0 +1,29 @@
default_stages: [commit, push]
repos:
  - repo: https://github.com/shellcheck-py/shellcheck-py
    rev: v0.9.0.2
    hooks:
      - id: shellcheck
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
  - repo: https://github.com/charliermarsh/ruff-pre-commit
    # Ruff version.
    rev: "v0.0.263"
    hooks:
      - id: ruff
      # Enable auto fix
      # args: [--fix, --exit-non-zero-on-fix]
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        name: isort (python)
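Usage note (not part of this commit): the hooks above only take effect once pre-commit is installed into the local clone, typically by running "pre-commit install"; because default_stages also lists push, push-stage hooks additionally need "pre-commit install --hook-type pre-push".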
@@ -0,0 +1,107 @@
import os
from threading import Lock
from typing import BinaryIO

import ffmpeg
import numpy as np
import torch
import whisper
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import RedirectResponse

from . import __version__

# TODO use pydantic config
model_name = os.getenv("ASR_MODEL", "base")
if torch.cuda.is_available():
    model = whisper.load_model(model_name).cuda()
else:
    model = whisper.load_model(model_name)
model_lock = Lock()


# TODO use pydantic config
SAMPLE_RATE = 16000


# TODO move transcribe to a modeling worker
def transcribe(
    audio,
    # task: Union[str, None],
    # language: Union[str, None],
    # initial_prompt: Union[str, None],
):
    # options_dict = {"task" : task}
    # if language:
    #     options_dict["language"] = language
    # if initial_prompt:
    #     options_dict["initial_prompt"] = initial_prompt
    with model_lock:
        # result = model.transcribe(audio, **options_dict)
        result = model.transcribe(audio)

    return result


# TODO probably can offload this on a worker queue too
def load_audio(file: BinaryIO, encode=True, sr: int = SAMPLE_RATE):
    """
    Open an audio file object and read as mono waveform, resampling as necessary.
    Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py
    to accept a file object

    Parameters
    ----------
    file: BinaryIO
        The audio file like object
    encode: Boolean
        If true, encode audio stream to WAV before sending to whisper
    sr: int
        The sample rate to resample the audio if necessary
    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    if encode:
        try:
            # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
            # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
            out, _ = (
                ffmpeg.input("pipe:", threads=0)
                .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
                .run(
                    cmd="ffmpeg",
                    capture_stdout=True,
                    capture_stderr=True,
                    input=file.read(),
                )
            )
        except ffmpeg.Error as e:
            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    else:
        out = file.read()

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


app = FastAPI(
    title="Local Whisper",
    description="Making OpenAPI's Open Whisper available via ReST API locally.",
    version=__version__,
    swagger_ui_parameters={"defaultModelsExpandDepth": -1},
    license_info={
        "name": "MIT License",
    },
)


@app.get("/", response_class=RedirectResponse, include_in_schema=False)
async def index():
    return "/docs"


@app.post("/audio/transcriptions")
async def asr(file: UploadFile = File(...)):
    if file.content_type.startswith("audio/"):
        transcription = transcribe(load_audio(file.file))
        return {"text": transcription["text"]}
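Usage sketch (not part of this commit): once the service is running, the transcription endpoint can be exercised with a small Python client. The host/port, the sample file name, and the requests package (not in this project's requirements) are assumptions for illustration.

import requests  # assumed client-side dependency, not listed in the requirements below

# Hypothetical setup: server listening on localhost:8000 and a sample.wav file on disk.
with open("sample.wav", "rb") as f:
    response = requests.post(
        "http://localhost:8000/audio/transcriptions",
        # The endpoint only answers when content_type starts with "audio/", so set it explicitly.
        files={"file": ("sample.wav", f, "audio/wav")},
    )

response.raise_for_status()
print(response.json()["text"])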
@@ -1,2 +1,4 @@
 fastapi
 openai-whisper
+uvicorn
+python-multipart
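With uvicorn and python-multipart added, the app can be served by uvicorn. A minimal launch sketch; "app.webservice:app" is a placeholder import path, since the actual module path is not shown in this diff:

import uvicorn

if __name__ == "__main__":
    # Placeholder import string; substitute the real module that defines the FastAPI app above.
    uvicorn.run("app.webservice:app", host="0.0.0.0", port=8000)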