From 4e7cd1b366d7efec4c7bd4043eb907534c30727a Mon Sep 17 00:00:00 2001 From: Drew Bednar Date: Sat, 3 Feb 2024 16:41:37 -0500 Subject: [PATCH] Hello split-it --- .coveragerc | 6 ++ .drone.yml | 44 +++++++++++++ .gitignore | 62 ++++++++++++++++++ .pre-commit-config.yaml | 40 ++++++++++++ LICENSE | 10 +++ README.md | 26 ++++++++ dev-requirements.in | 12 ++++ dev-requirements.txt | 89 +++++++++++++++++++++++++ entrypoint.sh | 8 +++ pyproject.toml | 59 +++++++++++++++++ requirements.in | 2 + requirements.txt | 10 +++ scripts/_common.sh | 10 +++ scripts/bootstrap.sh | 47 +++++++++++++ scripts/run_linters.sh | 28 ++++++++ scripts/run_unit_tests.sh | 12 ++++ split_it/__init__.py | 0 split_it/cli.py | 134 ++++++++++++++++++++++++++++++++++++++ tasks.py | 52 +++++++++++++++ tests/__init__.py | 0 tests/conftest.py | 0 tests/test_example.py | 6 ++ 22 files changed, 657 insertions(+) create mode 100644 .coveragerc create mode 100644 .drone.yml create mode 100644 .gitignore create mode 100644 .pre-commit-config.yaml create mode 100644 LICENSE create mode 100644 README.md create mode 100644 dev-requirements.in create mode 100644 dev-requirements.txt create mode 100755 entrypoint.sh create mode 100644 pyproject.toml create mode 100644 requirements.in create mode 100644 requirements.txt create mode 100644 scripts/_common.sh create mode 100755 scripts/bootstrap.sh create mode 100755 scripts/run_linters.sh create mode 100755 scripts/run_unit_tests.sh create mode 100644 split_it/__init__.py create mode 100644 split_it/cli.py create mode 100644 tasks.py create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/test_example.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..dd63330 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,6 @@ +[run] +source = split_it +omit = test* + +[report] +show_missing = True diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 0000000..b1ccb6c --- /dev/null +++ b/.drone.yml @@ -0,0 
+1,44 @@ +kind: pipeline +type: docker +name: CI Test/Lint Pipeline + +steps: +- name: Unit Tests and Linters + # Bullseye because drone runner host OS is using older libseccomp2 causing issues + # with thread allocation. See: https://github.com/docker-library/python/issues/835 + image: python:3.11-bullseye + commands: + - bash -xc './scripts/run_linters.sh' + - bash -xc './scripts/run_unit_tests.sh' + group: test-lint + +trigger: + event: + - pull_request + - push + +# Secrets used to pull private images +image_pull_secrets: + - dockerconfigjson + +--- +kind: pipeline +type: docker +name: Build Production Image +steps: +- name: Build split_it Container Image + image: plugins/docker + settings: + username: automate + password: + from_secret: automate_password + dockerfile: Dockerfile + registry: registry.runcible.io + repo: registry.runcible.io/split_it + tags: + - ${DRONE_COMMIT_SHA} + when: + branch: + - master + event: + - push diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e8afa11 --- /dev/null +++ b/.gitignore @@ -0,0 +1,62 @@ +# ---> Python +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Pycharm +.idea diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..2a5342e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,40 @@ +default_stages: [commit, push] +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.2.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + args: [--allow-multiple-documents] + - id: check-added-large-files + - id: debug-statements + - repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.9.0.5 + hooks: + - id: shellcheck + - repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black + args: ["split_it", "./tests/"] + types: [ python ] + - repo: local + hooks: + - id: isort + name: isort + entry: isort + language: system + types: [python] + require_serial: true + - id: ruff + name: ruff + entry: ruff + language: system + types: [python] + args: [ + # '-rn', # Only display messages + # '-sn', # Don't display the score + # '--disable=C,R,W0511', # Disable C and R type messages, and TODO fixme warning + ] + require_serial: true diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..555053c --- /dev/null +++ b/LICENSE @@ -0,0 +1,10 @@ + +MIT License + +Copyright (c) 2024, Drew Bednar + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 
subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..6268102 --- /dev/null +++ b/README.md @@ -0,0 +1,26 @@ +# split_it + +Tool for Splitting Audio + +## First time setup + +Create a virtual environment and activate it. Now from the root project directory run `./scripts/bootstrap.sh`. This will install `pip-tools` and sync any dependencies for the first time. + +## Dependency management + +Dependencies are managed via [pip-tools](https://github.com/jazzband/pip-tools). + +### Adding a dependency + +To add a dependency, edit `requirements.in` (or `dev-requirements.in` +for dev dependencies) and add your dependency then run `pip-compile +requirements.in`. + +### Syncing dependencies + +Run `pip-sync requirements.txt dev-requirements.txt`. + +## Testing + +Run the tests by invoking `py.test` in the project root. Make sure you +run any pending migrations beforehand. 
diff --git a/dev-requirements.in b/dev-requirements.in new file mode 100644 index 0000000..557d1fa --- /dev/null +++ b/dev-requirements.in @@ -0,0 +1,12 @@ +-c ./requirements.txt + +black +build +invoke +isort<=5.12.0 +pip-tools +pre-commit +pytest +pytest-cov +shellcheck-py==0.9.0.5 +ruff diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 0000000..e881137 --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1,89 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile dev-requirements.in +# +black==24.1.1 + # via -r dev-requirements.in +build==1.0.3 + # via + # -r dev-requirements.in + # pip-tools +cfgv==3.4.0 + # via pre-commit +click==8.1.7 + # via + # black + # pip-tools +coverage[toml]==7.4.1 + # via + # coverage + # pytest-cov +distlib==0.3.8 + # via virtualenv +exceptiongroup==1.2.0 + # via pytest +filelock==3.13.1 + # via virtualenv +identify==2.5.33 + # via pre-commit +iniconfig==2.0.0 + # via pytest +invoke==2.2.0 + # via -r dev-requirements.in +isort==5.12.0 + # via -r dev-requirements.in +mypy-extensions==1.0.0 + # via black +nodeenv==1.8.0 + # via pre-commit +packaging==23.2 + # via + # black + # build + # pytest +pathspec==0.12.1 + # via black +pip-tools==7.3.0 + # via -r dev-requirements.in +platformdirs==4.2.0 + # via + # black + # virtualenv +pluggy==1.4.0 + # via pytest +pre-commit==3.6.0 + # via -r dev-requirements.in +pyproject-hooks==1.0.0 + # via build +pytest==8.0.0 + # via + # -r dev-requirements.in + # pytest-cov +pytest-cov==4.1.0 + # via -r dev-requirements.in +pyyaml==6.0.1 + # via pre-commit +ruff==0.2.0 + # via -r dev-requirements.in +shellcheck-py==0.9.0.5 + # via -r dev-requirements.in +tomli==2.0.1 + # via + # black + # build + # coverage + # pip-tools + # pyproject-hooks + # pytest +typing-extensions==4.9.0 + # via black +virtualenv==20.25.0 + # via pre-commit +wheel==0.42.0 + # via pip-tools + +# The following packages are considered to be 
unsafe in a requirements file: +# pip +# setuptools diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100755 index 0000000..42aa2b7 --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# Best practice: Bash strict mode. +set -euo pipefail + +# Best practice: Make sure the image shuts down correctly by using `exec` in +# entry point shell scripts. +exec "$@" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..2a0c26e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,59 @@ +[project] +name = "split_it" +version = "0.1.0" +authors = [ + {name = "Drew Bednar", email = "drew@runcible.io"}, +] +description = "Tool for Splitting Audio" +requires-python = ">=3.10" +license = {text = "MIT"} +classifiers = [ + "Programming Language :: Python :: 3", +] +dynamic = ["readme", "dependencies"] + +[tool.setuptools.dynamic] +readme = {file = ["README.md"], content-type = "text/markdown"} +dependencies = {file = ["requirements.txt"]} + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[tool.isort] +profile = "black" +line_length = 120 +force_single_line = true +filter_files = true + +[tool.black] +line-length = 120 +skip-string-normalization = true +exclude = "(^/\\.git|^/env/|^/venv/|^/node_modules/)" + +[tool.ruff] +line-length = 120 +# Exclude a variety of commonly ignored directories. 
+exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv", +] diff --git a/requirements.in b/requirements.in new file mode 100644 index 0000000..0b46f1b --- /dev/null +++ b/requirements.in @@ -0,0 +1,2 @@ +pydub +tqdm diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..dca89b9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile requirements.in +# +pydub==0.25.1 + # via -r requirements.in +tqdm==4.66.1 + # via -r requirements.in diff --git a/scripts/_common.sh b/scripts/_common.sh new file mode 100644 index 0000000..b10e732 --- /dev/null +++ b/scripts/_common.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +function _setup_env() { + # build a virtual env + if [ -z "${VIRTUAL_ENV}" ]; then + python3 -m venv env + # shellcheck source=/dev/null + source env/bin/activate + fi +} diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh new file mode 100755 index 0000000..0019db8 --- /dev/null +++ b/scripts/bootstrap.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +# setting -e to exit immediately on a command failure. +# set -o pipefail sets the exit code of a pipeline to +# that of the rightmost command to exit with a non-zero status, +# or to zero if all commands of the pipeline exit successfully. +# set -u affects variables. 
When set, a reference to any variable +# you haven't previously defined - with the exceptions of $* and $@ - +# is an error, and causes the program to immediately exit +set -eo pipefail +IFS=$'\n\t' + +# shellcheck source=/dev/null +source "$(dirname "$0")/_common.sh" + +function project_bootstrap() { + _setup_env + pip install -U pip pip-tools=='7.3.0' + + # Check if requirements.txt exists, if not, compile it from requirements.in + if [ ! -f requirements.txt ]; then + pip-compile requirements.in + fi + + # Check if dev-requirements.txt exists, if not, compile it from dev-requirements.in + if [ ! -f dev-requirements.txt ]; then + pip-compile dev-requirements.in + fi + + pip-sync requirements.txt dev-requirements.txt +} + +function install_precommit_hooks() { + if [ ! -d ".git" ]; then + git init + fi + + if [ -z "$VIRTUAL_ENV" ]; then + echo "warning: you are not in a virtualenv" + exit 1 + fi + + pre-commit install +} + +project_bootstrap +install_precommit_hooks diff --git a/scripts/run_linters.sh b/scripts/run_linters.sh new file mode 100755 index 0000000..1659177 --- /dev/null +++ b/scripts/run_linters.sh @@ -0,0 +1,28 @@ +#! /usr/bin/env bash + +# shellcheck source=/dev/null +source "$(dirname "$0")/_common.sh" + +if [ "${DRONE}" == "true" ]; then + _setup_env + pip install -r requirements.txt -r dev-requirements.txt +fi + +# Run linting commands and capture their return codes +"${VIRTUAL_ENV}/bin/python3" -m isort --check ./split_it ./tests ./tasks.py +ISORT_EXIT_CODE=$? + +"${VIRTUAL_ENV}/bin/python3" -m black --check ./split_it ./tests ./tasks.py +BLACK_EXIT_CODE=$? + +"${VIRTUAL_ENV}/bin/python3" -m ruff ./split_it ./tests ./tasks.py +RUFF_EXIT_CODE=$? 
+ +# Check if any linting command failed +if [ $ISORT_EXIT_CODE -ne 0 ] || [ $BLACK_EXIT_CODE -ne 0 ] || [ $RUFF_EXIT_CODE -ne 0 ]; then + echo "Some linting checks failed" + # Exit with a non-zero status, you can choose which error code to return + exit 1 +fi + +echo "All linting checks passed" diff --git a/scripts/run_unit_tests.sh b/scripts/run_unit_tests.sh new file mode 100755 index 0000000..d0ba374 --- /dev/null +++ b/scripts/run_unit_tests.sh @@ -0,0 +1,12 @@ +#! /usr/bin/env bash +set -ex + +# shellcheck source=/dev/null +source "$(dirname "$0")/_common.sh" + +if [ "${DRONE}" == "true" ]; then + _setup_env + pip install -r requirements.txt -r dev-requirements.txt +fi + +exec "${VIRTUAL_ENV}/bin/python3" -m pytest -vv --cov "$@" diff --git a/split_it/__init__.py b/split_it/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/split_it/cli.py b/split_it/cli.py new file mode 100644 index 0000000..40d7eef --- /dev/null +++ b/split_it/cli.py @@ -0,0 +1,134 @@ +import argparse +import os +from concurrent.futures import ThreadPoolExecutor + +from pydub import AudioSegment +from pydub.silence import split_on_silence +from tqdm import tqdm + + +def save_chunk(chunk, start_time, output_dir, output_format): + chunk.export(os.path.join(output_dir, f'chunk_{start_time}.{output_format}'), format=output_format) + + +def merge_short_chunks(chunks, min_chunk_length_ms): + merged_chunks = [] + current_chunk = chunks[0] + + for chunk in chunks[1:]: + if len(current_chunk) + len(chunk) < min_chunk_length_ms: + current_chunk += chunk + else: + merged_chunks.append(current_chunk) + current_chunk = chunk + + merged_chunks.append(current_chunk) + return merged_chunks + + +def split_audio( + input_file, output_dir, chunk_length_ms, output_format, silence_based, silence_threshold, silence_min_len +): + # Load the input audio file using Pydub + audio = AudioSegment.from_file(input_file) + + # Create the output directory if it doesn't exist + if not 
os.path.exists(output_dir): + os.makedirs(output_dir) + + if silence_based: + # Split the audio file based on silence + min_silence_len = silence_min_len + silence_thresh = silence_threshold + chunks = split_on_silence(audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh) + + # Merge adjacent chunks shorter than the specified length + chunks = merge_short_chunks(chunks, chunk_length_ms) if chunks else [] + + # Set up progress bar with tqdm + pbar = tqdm(total=len(chunks), desc="Processing chunks based on silence") + + # Save chunks in parallel using ThreadPoolExecutor + with ThreadPoolExecutor() as executor: + for i, chunk in enumerate(chunks): + executor.submit(save_chunk, chunk, i, output_dir, output_format).add_done_callback( + lambda x: pbar.update(1) + ) + + else: + # Calculate the total length of the audio in milliseconds and the number of full chunks + audio_length_ms = len(audio) + num_chunks = audio_length_ms // chunk_length_ms + + # Set up progress bar with tqdm + pbar = tqdm(total=num_chunks + (audio_length_ms % chunk_length_ms != 0), desc="Processing fixed-size chunks") + + # Split and save chunks in parallel using ThreadPoolExecutor + with ThreadPoolExecutor() as executor: + for i in range(num_chunks): + start_time = i * chunk_length_ms + end_time = (i + 1) * chunk_length_ms + chunk = audio[start_time:end_time] + executor.submit(save_chunk, chunk, start_time, output_dir, output_format).add_done_callback( + lambda x: pbar.update(1) + ) + + # Handle the last chunk if there is any remainder + if audio_length_ms % chunk_length_ms != 0: + start_time = num_chunks * chunk_length_ms + end_time = audio_length_ms + chunk = audio[start_time:end_time] + executor.submit(save_chunk, chunk, start_time, output_dir, output_format).add_done_callback( + lambda x: pbar.update(1) + ) + + # Close progress bar + pbar.close() + + +def main(): + # Set up argument parser for the CLI app + parser = argparse.ArgumentParser(description="Split an audio file into equally sized 
chunks.") + parser.add_argument("input_file", help="Path to the input audio file.") + parser.add_argument("output_dir", help="Path to the output directory where chunks will be saved.") + parser.add_argument( + "--chunk_length", + type=int, + default=12000, + help="Length of each chunk in milliseconds (default: %(default)s ms).", + ) + parser.add_argument( + "--output_format", + type=str, + default="wav", + help="Output format for the audio chunks (default: wav). Supported formats include wav, mp3, and ogg.", + ) + parser.add_argument( + "--silence_based", + action="store_true", + help="Split on silence instead of fixed-size chunks; --chunk_length becomes the short-chunk merge threshold.", + ) + parser.add_argument( + "--silence_threshold", type=int, default=-40, help="Threshold in dB for silence based splitting." + ) + parser.add_argument( + "--silence_min_len", type=int, default=400, help="Minimum length of silence in milliseconds for splitting." + ) + + # Parse the arguments + args = parser.parse_args() + + # Call the split_audio function with the provided arguments + split_audio( + args.input_file, + args.output_dir, + args.chunk_length, + args.output_format, + args.silence_based, + args.silence_threshold, + args.silence_min_len, + ) + + +if __name__ == "__main__": + main() diff --git a/tasks.py b/tasks.py new file mode 100644 index 0000000..fd6f923 --- /dev/null +++ b/tasks.py @@ -0,0 +1,52 @@ +import os + +from invoke import task + +IMAGE_RESPOSITORY = os.environ.get("IMAGE_RESPOSITORY", "registry.runcible.io/split_it") + + +@task +def update_deps(c): + """Updates dependencies""" + c.run("pip-compile requirements.in", pty=True) + c.run("pip-compile dev-requirements.in", pty=True) + + +@task +def sync_deps(c): + """Syncs local dependencies""" + c.run("pip-sync requirements.txt dev-requirements.txt") + + +@task +def lint(c): + """Runs all linters against the project.""" + c.run("./scripts/run_linters.sh", pty=True) + + +@task +def delint(c): + """Applies 
automated linters to project""" + c.run("isort ./split_it ./tests ./tasks.py", pty=True) + c.run("black ./split_it ./tests ./tasks.py", pty=True) + + +@task +def build(c): + """Builds the project as a Python package.""" + c.run("python3 -m build") + + +@task +def build_image(c, dev=True, registry_user=None, registry_token=None, push=False, login=False): + """Builds the split_it container image.""" + context_dir = c.run("pwd", hide=True).stdout.strip() + commit_sha = c.run("git rev-parse --short HEAD", hide=True).stdout.strip() + image_name = f"{IMAGE_RESPOSITORY}:{commit_sha}{'-dev' if dev else ''}" + c.run(f"docker build --build-arg='COMMIT_SHA={commit_sha}' -t {image_name} {context_dir}") + if login: + if registry_user is None or registry_token is None: + raise ValueError("--registry_user and --registry_token must be provided if using --login parameter") + c.run(f"docker login -u {registry_user} -p {registry_token}") + if push: + c.run(f"docker push {image_name}") diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_example.py b/tests/test_example.py new file mode 100644 index 0000000..0d72818 --- /dev/null +++ b/tests/test_example.py @@ -0,0 +1,6 @@ +from split_it.cli import merge_short_chunks + + +def test_example(): + merged = merge_short_chunks([[1], [2], [3, 4, 5]], 3) + assert merged == [[1, 2], [3, 4, 5]]