Working initial implementation
parent 4fddd87271
commit 3eac8f44cb
@@ -0,0 +1,29 @@
default_stages: [commit, push]
repos:
  - repo: https://github.com/shellcheck-py/shellcheck-py
    rev: v0.9.0.2
    hooks:
      - id: shellcheck
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
  - repo: https://github.com/charliermarsh/ruff-pre-commit
    # Ruff version.
    rev: "v0.0.263"
    hooks:
      - id: ruff
      # Enable auto fix
      # args: [--fix, --exit-non-zero-on-fix]
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        name: isort (python)
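Usage note (not part of this commit): the hooks above only take effect once pre-commit is installed into the local clone, typically by running "pre-commit install"; because default_stages also lists push, push-stage hooks additionally need "pre-commit install --hook-type pre-push".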
@@ -0,0 +1,107 @@
import os
from threading import Lock
from typing import BinaryIO

import ffmpeg
import numpy as np
import torch
import whisper
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import RedirectResponse

from . import __version__

# TODO use pydantic config
model_name = os.getenv("ASR_MODEL", "base")
if torch.cuda.is_available():
    model = whisper.load_model(model_name).cuda()
else:
    model = whisper.load_model(model_name)
model_lock = Lock()


# TODO use pydantic config
SAMPLE_RATE = 16000


# TODO move transcribe to a modeling worker
def transcribe(
    audio,
    # task: Union[str, None],
    # language: Union[str, None],
    # initial_prompt: Union[str, None],
):
    # options_dict = {"task" : task}
    # if language:
    #     options_dict["language"] = language
    # if initial_prompt:
    #     options_dict["initial_prompt"] = initial_prompt
    with model_lock:
        # result = model.transcribe(audio, **options_dict)
        result = model.transcribe(audio)

    return result


# TODO probably can offload this on a worker queue too
def load_audio(file: BinaryIO, encode=True, sr: int = SAMPLE_RATE):
    """
    Open an audio file object and read as mono waveform, resampling as necessary.
    Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py
    to accept a file object

    Parameters
    ----------
    file: BinaryIO
        The audio file like object
    encode: Boolean
        If true, encode audio stream to WAV before sending to whisper
    sr: int
        The sample rate to resample the audio if necessary
    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    if encode:
        try:
            # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
            # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
            out, _ = (
                ffmpeg.input("pipe:", threads=0)
                .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
                .run(
                    cmd="ffmpeg",
                    capture_stdout=True,
                    capture_stderr=True,
                    input=file.read(),
                )
            )
        except ffmpeg.Error as e:
            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    else:
        out = file.read()

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


app = FastAPI(
    title="Local Whisper",
    description="Making OpenAPI's Open Whisper available via ReST API locally.",
    version=__version__,
    swagger_ui_parameters={"defaultModelsExpandDepth": -1},
    license_info={
        "name": "MIT License",
    },
)


@app.get("/", response_class=RedirectResponse, include_in_schema=False)
async def index():
    return "/docs"


@app.post("/audio/transcriptions")
async def asr(file: UploadFile = File(...)):
    if file.content_type.startswith("audio/"):
        transcription = transcribe(load_audio(file.file))
        return {"text": transcription["text"]}
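Usage sketch (not part of this commit): once the service is running, the transcription endpoint can be exercised with a small Python client. The host/port, the sample file name, and the requests package (not in this project's requirements) are assumptions for illustration.

import requests  # assumed client-side dependency, not listed in the requirements below

# Hypothetical setup: server listening on localhost:8000 and a sample.wav file on disk.
with open("sample.wav", "rb") as f:
    response = requests.post(
        "http://localhost:8000/audio/transcriptions",
        # The endpoint only answers when content_type starts with "audio/", so set it explicitly.
        files={"file": ("sample.wav", f, "audio/wav")},
    )

response.raise_for_status()
print(response.json()["text"])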
@@ -1,2 +1,4 @@
 fastapi
 openai-whisper
+uvicorn
+python-multipart
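With uvicorn and python-multipart added, the app can be served by uvicorn. A minimal launch sketch; "app.webservice:app" is a placeholder import path, since the actual module path is not shown in this diff:

import uvicorn

if __name__ == "__main__":
    # Placeholder import string; substitute the real module that defines the FastAPI app above.
    uvicorn.run("app.webservice:app", host="0.0.0.0", port=8000)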