Hello split-it

2 years ago · 4e7cd1b366
commit 4e7cd1b366
22 changed files with 657 additions and 0 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -0,0 +1,6 @@
+[run]
+source = split_it
+omit = test*
+
+[report]
+show_missing = True
--- a/.drone.yml
+++ b/.drone.yml
@ -0,0 +1,44 @@
+kind: pipeline
+type: docker
+name: CI Test/Lint Pipeline
+
+steps:
+- name: Unit Tests and Linters
+  # Bullseye because drone runner host OS is using older libseccomp2 causing issues
+  # with thread allocation. See: https://github.com/docker-library/python/issues/835
+  image: python:3.11-bullseye
+  commands:
+   - bash -xc './scripts/run_linters.sh'
+   - bash -xc './scripts/run_unit_tests.sh'
+  group: test-lint
+
+trigger:
+  event:
+    - pull_request
+    - push
+
+# Secrets used to pull private images
+image_pull_secrets:
+  - dockerconfigjson
+
+---
+kind: pipeline
+type: docker
+name: Build Production Image
+steps:
+- name: Build split_it Container Image
+  image: plugins/docker
+  settings:
+    username: automate
+    password:
+      from_secret: automate_password
+    dockerfile: Dockerfile
+    registry: registry.runcible.io
+    repo: registry.runcible.io/split_it
+    tags:
+      - ${DRONE_COMMIT_SHA}
+  when:
+    branch:
+      - master
+    event:
+      - push
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,62 @@
+# ---> Python
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Pycharm
+.idea
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,40 @@
+default_stages: [commit, push]
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+        args: [--allow-multiple-documents]
+      - id: check-added-large-files
+      - id: debug-statements
+  - repo: https://github.com/shellcheck-py/shellcheck-py
+    rev: v0.9.0.5
+    hooks:
+      - id: shellcheck
+  - repo: https://github.com/psf/black
+    rev: 22.10.0
+    hooks:
+      - id: black
+        args: ["split_it", "./tests/"]
+        types: [ python ]
+  - repo: local
+    hooks:
+      - id: isort
+        name: isort
+        entry: isort
+        language: system
+        types: [python]
+        require_serial: true
+      - id: ruff
+        name: ruff
+        entry: ruff
+        language: system
+        types: [python]
+        args: [
+            # '-rn', # Only display messages
+            # '-sn', # Don't display the score
+            # '--disable=C,R,W0511', # Disable C and R type messages, and TODO fixme warning
+          ]
+        require_serial: true
--- a/10
+++ b/10
@ -0,0 +1,10 @@
+
+MIT License
+
+Copyright (c) 2024, Drew Bednar
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/README.md
+++ b/README.md
@ -0,0 +1,26 @@
+# split_it
+
+Tool for Splitting Audio
+
+## First time setup
+
+Create a virtual environment and activate it. Now from the root project directory run `./scripts/bootstrap.sh`. This will install `pip-tools` and sync any dependencies for the first time.
+
+## Dependency management
+
+Dependencies are managed via [pip-tools](https://github.com/jazzband/pip-tools).
+
+### Adding a dependency
+
+To add a dependency, edit `requirements.in` (or `dev-requirements.in`
+for dev dependencies) and add your dependency then run `pip-compile
+requirements.in`.
+
+### Syncing dependencies
+
+Run `pip-sync requirements.txt dev-requirements.txt`.
+
+## Testing
+
+Run the tests by invoking `py.test` in the project root.  Make sure you
+run any pending migrations beforehand.
--- a/dev-requirements.in
+++ b/dev-requirements.in
@ -0,0 +1,12 @@
+-c ./requirements.txt
+
+black
+build
+invoke
+isort<=5.12.0
+pip-tools
+pre-commit
+pytest
+pytest-cov
+shellcheck-py==0.9.0.5
+ruff
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@ -0,0 +1,89 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile dev-requirements.in
+#
+black==24.1.1
+    # via -r dev-requirements.in
+build==1.0.3
+    # via
+    #   -r dev-requirements.in
+    #   pip-tools
+cfgv==3.4.0
+    # via pre-commit
+click==8.1.7
+    # via
+    #   black
+    #   pip-tools
+coverage[toml]==7.4.1
+    # via
+    #   coverage
+    #   pytest-cov
+distlib==0.3.8
+    # via virtualenv
+exceptiongroup==1.2.0
+    # via pytest
+filelock==3.13.1
+    # via virtualenv
+identify==2.5.33
+    # via pre-commit
+iniconfig==2.0.0
+    # via pytest
+invoke==2.2.0
+    # via -r dev-requirements.in
+isort==5.12.0
+    # via -r dev-requirements.in
+mypy-extensions==1.0.0
+    # via black
+nodeenv==1.8.0
+    # via pre-commit
+packaging==23.2
+    # via
+    #   black
+    #   build
+    #   pytest
+pathspec==0.12.1
+    # via black
+pip-tools==7.3.0
+    # via -r dev-requirements.in
+platformdirs==4.2.0
+    # via
+    #   black
+    #   virtualenv
+pluggy==1.4.0
+    # via pytest
+pre-commit==3.6.0
+    # via -r dev-requirements.in
+pyproject-hooks==1.0.0
+    # via build
+pytest==8.0.0
+    # via
+    #   -r dev-requirements.in
+    #   pytest-cov
+pytest-cov==4.1.0
+    # via -r dev-requirements.in
+pyyaml==6.0.1
+    # via pre-commit
+ruff==0.2.0
+    # via -r dev-requirements.in
+shellcheck-py==0.9.0.5
+    # via -r dev-requirements.in
+tomli==2.0.1
+    # via
+    #   black
+    #   build
+    #   coverage
+    #   pip-tools
+    #   pyproject-hooks
+    #   pytest
+typing-extensions==4.9.0
+    # via black
+virtualenv==20.25.0
+    # via pre-commit
+wheel==0.42.0
+    # via pip-tools
+
+# The following packages are considered to be unsafe in a requirements file:
+# pip
+# setuptools
--- a/entrypoint.sh
+++ b/entrypoint.sh
@ -0,0 +1,8 @@
+#!/bin/bash
+
+# Best practice: Bash strict mode.
+set -euo pipefail
+
+# Best practice: Make sure the image shuts down correctly by using `exec` in
+# entry point shell scripts.
+exec "$@"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,59 @@
+[project]
+name = "split_it"
+version = "0.1.0"
+authors = [
+    {name = "Drew Bednar", email = "drew@runcible.io"},
+]
+description = "Tool for Splitting Audio"
+requires-python = ">=3.10"
+license = {text = "MIT"}
+classifiers = [
+    "Programming Language :: Python :: 3",
+]
+dynamic = ["readme", "dependencies"]
+
+[tool.setuptools.dynamic]
+readme = {file = ["README.md"], content-type = "text/markdown"}
+dependencies = {file = ["requirements.txt"]}
+
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[tool.isort]
+profile = "black"
+line_length = 120
+force_single_line = true
+filter_files = true
+
+[tool.black]
+line-length = 120
+skip-string-normalization = true
+exclude = "(^/\\.git|^/env/|^/venv/|^/node_modules/)"
+
+[tool.ruff]
+line-length = 120
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "venv",
+]
--- a/requirements.in
+++ b/requirements.in
@ -0,0 +1,2 @@
+pydub
+tqdm
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,10 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile requirements.in
+#
+pydub==0.25.1
+    # via -r requirements.in
+tqdm==4.66.1
+    # via -r requirements.in
--- a/scripts/_common.sh
+++ b/scripts/_common.sh
@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+function _setup_env() {
+  # Create a virtual environment in ./env and activate it, but only when the
+  # caller is not already inside one (an active venv exports VIRTUAL_ENV).
+  # The ":-" default keeps the check safe for callers running under `set -u`,
+  # where expanding an unset ${VIRTUAL_ENV} would abort the script.
+  if [ -z "${VIRTUAL_ENV:-}" ]; then
+    python3 -m venv env
+    # shellcheck source=/dev/null
+    source env/bin/activate
+  fi
+}
--- a/scripts/bootstrap.sh
+++ b/scripts/bootstrap.sh
@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+# setting -e to exit immediately on a command failure.
+# set -o pipefail sets the exit code of a pipeline to
+# that of the rightmost command to exit with a non-zero status,
+# or to zero if all commands of the pipeline exit successfully.
+# set -u affects variables. When set, a reference to any variable
+# you haven't previously defined - with the exceptions of $* and $@ -
+# is an error, and causes the program to immediately exit.
+# NOTE: -u is described for reference only; it is not enabled below.
+set -eo pipefail
+IFS=$'\n\t'
+
+# shellcheck source=/dev/null
+source "$(dirname "$0")/_common.sh"
+
+function project_bootstrap() {
+    # Make sure a virtualenv is active, then install the pinned tooling.
+    _setup_env
+    pip install -U pip pip-tools=='7.3.0'
+
+    # Compile each lock file from its .in source only when the lock file is
+    # missing; an existing lock file is left untouched.
+    local name
+    for name in requirements dev-requirements; do
+        [ -f "${name}.txt" ] || pip-compile "${name}.in"
+    done
+
+    # Bring the environment in line with both lock files.
+    pip-sync requirements.txt dev-requirements.txt
+}
+
+function install_precommit_hooks() {
+    # Install the pre-commit git hooks for this checkout.
+    #
+    # The virtualenv check runs first so the function aborts before it has
+    # changed anything (previously `git init` could run and then the script
+    # would exit). The message is fatal, so it is labelled as an error and
+    # written to stderr.
+    if [ -z "${VIRTUAL_ENV:-}" ]; then
+        echo "error: you are not in a virtualenv" >&2
+        exit 1
+    fi
+
+    # `pre-commit install` needs a git repository to write hooks into.
+    if [ ! -d ".git" ]; then
+        git init
+    fi
+
+    pre-commit install
+}
+
+project_bootstrap
+install_precommit_hooks
--- a/scripts/run_linters.sh
+++ b/scripts/run_linters.sh
@ -0,0 +1,28 @@
+#! /usr/bin/env bash
+#
+# Runs isort, black and ruff in check mode over the project sources.
+# Every linter is executed even when an earlier one fails, so one run reports
+# all problems; the script exits 1 if any of them failed.
+
+# shellcheck source=/dev/null
+source "$(dirname "$0")/_common.sh"
+
+# On the Drone CI runner there is no prepared environment: build one and
+# install the pinned runtime and dev dependencies into it.
+if [ "${DRONE}" == "true" ]; then
+    _setup_env
+    pip install -r requirements.txt -r dev-requirements.txt
+fi
+
+# Paths to lint. The package directory is ./split_it (the same targets the
+# `delint` task in tasks.py formats).
+LINT_TARGETS=(./split_it ./tests ./tasks.py)
+
+# Run linting commands and capture their return codes
+"${VIRTUAL_ENV}/bin/python3" -m isort --check "${LINT_TARGETS[@]}"
+ISORT_EXIT_CODE=$?
+
+"${VIRTUAL_ENV}/bin/python3" -m black --check "${LINT_TARGETS[@]}"
+BLACK_EXIT_CODE=$?
+
+"${VIRTUAL_ENV}/bin/python3" -m ruff "${LINT_TARGETS[@]}"
+RUFF_EXIT_CODE=$?
+
+# Check if any linting command failed
+if [ $ISORT_EXIT_CODE -ne 0 ] || [ $BLACK_EXIT_CODE -ne 0 ] || [ $RUFF_EXIT_CODE -ne 0 ]; then
+    echo "Some linting checks failed"
+    # Exit with a non-zero status, you can choose which error code to return
+    exit 1
+fi
+
+echo "All linting checks passed"
--- a/scripts/run_unit_tests.sh
+++ b/scripts/run_unit_tests.sh
@ -0,0 +1,12 @@
+#! /usr/bin/env bash
+#
+# Runs the test suite with coverage. Any arguments are forwarded to pytest.
+set -ex
+
+# shellcheck source=/dev/null
+source "$(dirname "$0")/_common.sh"
+
+# On the Drone CI runner there is no prepared environment: build one and
+# install the pinned runtime and dev dependencies into it.
+if [ "${DRONE}" == "true" ]; then
+    _setup_env
+    pip install -r requirements.txt -r dev-requirements.txt
+fi
+
+# --cov takes an optional value, so a bare `--cov "$@"` would swallow the
+# first forwarded argument (e.g. a test path) as the coverage source. Naming
+# the package explicitly keeps forwarded arguments as pytest arguments.
+exec "${VIRTUAL_ENV}/bin/python3" -m pytest -vv --cov=split_it "$@"
--- a/split_it/__init__.py
+++ b/split_it/__init__.py
--- a/split_it/cli.py
+++ b/split_it/cli.py
@ -0,0 +1,134 @@
+import argparse
+import os
+from concurrent.futures import ThreadPoolExecutor
+
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+from tqdm import tqdm
+
+
+def save_chunk(chunk, start_time, output_dir, output_format):
+    """Export ``chunk`` into ``output_dir`` as ``chunk_<start_time>.<output_format>``."""
+    file_name = f'chunk_{start_time}.{output_format}'
+    destination = os.path.join(output_dir, file_name)
+    chunk.export(destination, format=output_format)
+
+
+def merge_short_chunks(chunks, min_chunk_length_ms):
+    """Concatenate adjacent chunks while their combined length stays short.
+
+    Walks ``chunks`` in order, appending each chunk to the running one as long
+    as the combined ``len()`` is below ``min_chunk_length_ms``; otherwise the
+    running chunk is emitted and a new one is started.
+
+    Args:
+        chunks: Ordered sequence of chunks supporting ``len()`` and ``+``.
+        min_chunk_length_ms: Combined length below which neighbours are merged.
+
+    Returns:
+        A new list of merged chunks; empty when ``chunks`` is empty.
+    """
+    # Nothing to merge, e.g. when silence detection found no audible section.
+    # Without this guard ``chunks[0]`` raises IndexError.
+    if not chunks:
+        return []
+
+    merged_chunks = []
+    current_chunk = chunks[0]
+
+    for chunk in chunks[1:]:
+        if len(current_chunk) + len(chunk) < min_chunk_length_ms:
+            current_chunk += chunk
+        else:
+            merged_chunks.append(current_chunk)
+            current_chunk = chunk
+
+    # The running chunk is always pending when the loop ends.
+    merged_chunks.append(current_chunk)
+    return merged_chunks
+
+
+def _export_chunks(labelled_chunks, total, description, output_dir, output_format):
+    """Save ``(label, chunk)`` pairs in parallel behind a progress bar.
+
+    Each pair is handed to ``save_chunk`` on a thread pool. After every export
+    has finished, the first failure (if any) is re-raised so a failed export is
+    not silently dropped.
+    """
+    futures = []
+    # Both context managers clean up on error: the pool is joined first, then
+    # the progress bar is closed.
+    with tqdm(total=total, desc=description) as pbar, ThreadPoolExecutor() as executor:
+        for label, chunk in labelled_chunks:
+            future = executor.submit(save_chunk, chunk, label, output_dir, output_format)
+            future.add_done_callback(lambda _done: pbar.update(1))
+            futures.append(future)
+
+    # Calling result() surfaces any exception raised inside a worker thread.
+    for future in futures:
+        future.result()
+
+
+def split_audio(
+    input_file, output_dir, chunk_length_ms, output_format, silence_based, silence_threshold, silence_min_len
+):
+    """Split an audio file into chunks and write them to ``output_dir``.
+
+    Args:
+        input_file: Path of the audio file to load.
+        output_dir: Directory for the exported chunks; created if missing.
+        chunk_length_ms: Chunk length in milliseconds for fixed-size splitting.
+            In silence mode, adjacent chunks are merged while their combined
+            length stays below this value.
+        output_format: Export format, also used as the file extension.
+        silence_based: Split on silence instead of at fixed intervals.
+        silence_threshold: ``silence_thresh`` passed to ``split_on_silence``.
+        silence_min_len: ``min_silence_len`` passed to ``split_on_silence``.
+
+    Raises:
+        ValueError: If fixed-size splitting is requested with a non-positive
+            ``chunk_length_ms``.
+    """
+    if not silence_based and chunk_length_ms <= 0:
+        raise ValueError("chunk_length_ms must be a positive number of milliseconds")
+
+    # Load the input audio file using Pydub
+    audio = AudioSegment.from_file(input_file)
+
+    # Create the output directory if it doesn't exist (no exists/create race)
+    os.makedirs(output_dir, exist_ok=True)
+
+    if silence_based:
+        chunks = split_on_silence(audio, min_silence_len=silence_min_len, silence_thresh=silence_threshold)
+
+        # Merge adjacent short chunks; skip when no non-silent audio was found.
+        chunks = merge_short_chunks(chunks, chunk_length_ms) if chunks else []
+
+        # Silence-based chunks are labelled by their index.
+        _export_chunks(
+            enumerate(chunks), len(chunks), "Processing chunks based on silence", output_dir, output_format
+        )
+    else:
+        # One start offset per chunk; the final chunk may be shorter than the rest.
+        audio_length_ms = len(audio)
+        start_times = range(0, audio_length_ms, chunk_length_ms)
+
+        # Fixed-size chunks are labelled by their start offset in milliseconds.
+        # Sliced lazily so chunks are only created as they are submitted.
+        labelled_chunks = (
+            (start_time, audio[start_time : min(start_time + chunk_length_ms, audio_length_ms)])
+            for start_time in start_times
+        )
+        _export_chunks(
+            labelled_chunks, len(start_times), "Processing fixed-size chunks", output_dir, output_format
+        )
+
+
+def main():
+    """Parse the command-line arguments and run ``split_audio`` with them."""
+    # Set up argument parser for the CLI app
+    parser = argparse.ArgumentParser(
+        description="Split an audio file into fixed-size chunks, or into chunks separated by silence."
+    )
+    parser.add_argument("input_file", help="Path to the input audio file.")
+    parser.add_argument("output_dir", help="Path to the output directory where chunks will be saved.")
+    parser.add_argument(
+        "--chunk_length",
+        type=int,
+        default=12000,
+        # The help text previously advertised 300000 ms, which is not the default.
+        help=(
+            "Length of each chunk in milliseconds (default: 12000 ms / 12 seconds). "
+            "With --silence_based, adjacent chunks are merged while their combined length stays below this value."
+        ),
+    )
+    parser.add_argument(
+        "--output_format",
+        type=str,
+        default="wav",
+        help="Output format for the audio chunks (default: wav). Supported formats include wav, mp3, and ogg.",
+    )
+    parser.add_argument(
+        "--silence_based",
+        action="store_true",
+        # --chunk_length is still used in this mode, as the merge threshold.
+        help=(
+            "Split the audio based on silence instead of fixed-size chunks. "
+            "If set, --chunk_length is used as the merge threshold for short chunks."
+        ),
+    )
+    parser.add_argument(
+        "--silence_threshold", type=int, default=-40, help="Threshold in dB for silence based splitting."
+    )
+    parser.add_argument(
+        "--silence_min_len", type=int, default=400, help="Minimum length of silence in milliseconds for splitting."
+    )
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Call the split_audio function with the provided arguments
+    split_audio(
+        args.input_file,
+        args.output_dir,
+        args.chunk_length,
+        args.output_format,
+        args.silence_based,
+        args.silence_threshold,
+        args.silence_min_len,
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/tasks.py
+++ b/tasks.py
@ -0,0 +1,52 @@
+import os
+
+from invoke import task
+
+IMAGE_RESPOSITORY = os.environ.get("IMAGE_RESPOSITORY", "registry.runcible.io/split_it")
+
+
+@task
+def update_deps(c):
+    """Updates dependencies by re-running pip-compile on both .in files."""
+    c.run("pip-compile requirements.in", pty=True)
+    c.run("pip-compile dev-requirements.in", pty=True)
+
+
+@task
+def sync_deps(c):
+    """Syncs local dependencies"""
+    # Runs pip-sync against both the runtime and the dev lock files.
+    c.run("pip-sync requirements.txt dev-requirements.txt")
+
+
+@task
+def lint(c):
+    """Runs all linters against the project."""
+    # Delegates to the same script the CI pipeline invokes.
+    c.run("./scripts/run_linters.sh", pty=True)
+
+
+@task
+def delint(c):
+    """Applies automated linters to project"""
+    # Runs isort first, then black, over the package, the tests and this file.
+    c.run("isort ./split_it ./tests ./tasks.py", pty=True)
+    c.run("black ./split_it ./tests ./tasks.py", pty=True)
+
+
+@task
+def build(c):
+    """Builds the project as a Python package."""
+    # Relies on the `build` package listed in dev-requirements.in.
+    c.run("python3 -m build")
+
+
+@task
+def build_image(c, dev=True, registry_user=None, registry_token=None, push=False, login=False):
+    """Builds the split_it container image.
+
+    The image is tagged ``<IMAGE_RESPOSITORY>:<short commit sha>`` with a
+    ``-dev`` suffix when ``dev`` is true.
+
+    Args:
+        c: Invoke context used to run the shell commands.
+        dev: Append ``-dev`` to the image tag.
+        registry_user: Registry user name; required with ``login``.
+        registry_token: Registry password/token; required with ``login``.
+        push: Push the image after building it.
+        login: Run ``docker login`` before pushing.
+
+    Raises:
+        ValueError: If ``login`` is requested without both credentials.
+    """
+    import shlex
+
+    # Validate up front so a missing credential does not cost a full image build.
+    if login and (registry_user is None or registry_token is None):
+        raise ValueError("--registry_user and --registry_token must be provided if using --login parameter")
+
+    context_dir = c.run("pwd", hide=True).stdout.strip()
+    commit_sha = c.run("git rev-parse --short HEAD", hide=True).stdout.strip()
+    image_name = f"{IMAGE_RESPOSITORY}:{commit_sha}{'-dev' if dev else ''}"
+    # Quote interpolated values: the working directory may contain spaces.
+    c.run(
+        f"docker build --build-arg='COMMIT_SHA={commit_sha}' "
+        f"-t {shlex.quote(image_name)} {shlex.quote(context_dir)}"
+    )
+    if login:
+        # Log in to the registry the image is pushed to. Docker treats the
+        # first path component as a registry host only when it contains "."
+        # or ":" or is "localhost"; otherwise the default registry is used.
+        first_part, _, remainder = IMAGE_RESPOSITORY.partition("/")
+        looks_like_host = "." in first_part or ":" in first_part or first_part == "localhost"
+        registry = first_part if remainder and looks_like_host else ""
+        login_cmd = f"docker login -u {shlex.quote(str(registry_user))} --password-stdin {shlex.quote(registry)}"
+        if not registry:
+            login_cmd = f"docker login -u {shlex.quote(str(registry_user))} --password-stdin"
+        # Pass the token through the child environment and stdin so it never
+        # appears in the command line or in echoed output.
+        c.run(f'printf "%s" "$REGISTRY_TOKEN" | {login_cmd}', env={"REGISTRY_TOKEN": str(registry_token)})
+    if push:
+        c.run(f"docker push {shlex.quote(image_name)}")
--- a/tests/__init__.py
+++ b/tests/__init__.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
--- a/tests/test_example.py
+++ b/tests/test_example.py
@ -0,0 +1,6 @@
+# NOTE(review): no split_it/example.py is added by this commit (only the
+# package init file and cli.py appear under split_it/) - confirm the module
+# exists, otherwise this import fails at test collection.
+from split_it.example import Example
+
+
+def test_example():
+    """Checks that Example exposes the name it was constructed with."""
+    my_example = Example(name="dirp")
+    assert my_example.name == "dirp"