Skip to content

Commit

Permalink
Merge pull request #104 from descriptinc/angela/ed-19914-studio-sound…
Browse files Browse the repository at this point in the history
…-is-ignoring-start_pts-0027-seconds

[Studio Sound] Ignore start_pts only for mp3 files
  • Loading branch information
anjoola committed Feb 1, 2024
2 parents 2638236 + 831b42b commit a1a2c86
Show file tree
Hide file tree
Showing 8 changed files with 35 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install ffmpeg
sudo apt-get install libsndfile1-dev
sudo apt-get install libsndfile1-dev libsox-dev libsox-fmt-mp3
python -m pip install --upgrade pip
pip install wheel
- name: Install recent FFMPEG
Expand Down
34 changes: 20 additions & 14 deletions audiotools/core/ffmpeg.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import subprocess
import tempfile
from pathlib import Path
from typing import Tuple

import ffmpy
import numpy as np
Expand Down Expand Up @@ -61,20 +62,26 @@ def r128stats(filepath: str, quiet: bool):
return stats_dict


def ffprobe_offset(path):
def ffprobe_offset_and_codec(path: str) -> Tuple[float, str]:
"""Given a path to a file, returns the start time offset and codec of
the first audio stream.
"""
ff = ffmpy.FFprobe(
inputs={path: None},
global_options="-show_entries format=start_time:stream=duration,start_time,codec_type,start_pts,time_base -of json -v quiet",
global_options="-show_entries format=start_time:stream=duration,start_time,codec_type,codec_name,start_pts,time_base -of json -v quiet",
)
streams = json.loads(ff.run(stdout=subprocess.PIPE)[0])["streams"]
seconds_offset = 0.0
# Get the offset of the first audio stream we find
codec = None

# Use the first audio stream we find: take its start time
# (if it has one) as the offset, along with its codec name.
for stream in streams:
if stream["codec_type"] == "audio":
seconds_offset = stream.get("start_time", 0.0)
codec = stream.get("codec_name")
break
return float(seconds_offset)
return float(seconds_offset), codec


class FFMPEGMixin:
Expand Down Expand Up @@ -174,17 +181,16 @@ def load_from_file_with_ffmpeg(cls, audio_path: str, quiet: bool = True, **kwarg
)
ff.run()

# We pad the file using the start time offset
# in case it's an audio stream starting at some
# offset in a video container.
pad = ffprobe_offset(audio_path)
# Don't pad files with discrepancies less than
# 0.027s - it's likely due to codec latency.
# The amount of latency introduced by mp3 is
# 1152 samples, which is about 0.0261 s at
# 44.1 kHz. So we set the threshold here slightly above that.
# We pad the file using the start time offset in case it's an audio
# stream starting at some offset in a video container.
pad, codec = ffprobe_offset_and_codec(audio_path)

# For mp3s, don't pad files whose start offset is less than 0.027 s -
# such a small offset is most likely codec latency. The latency
# introduced by mp3 is 1152 samples, which is about 0.0261 s at
# 44.1 kHz, so we set the threshold here slightly above that.
# Source: https://lame.sourceforge.io/tech-FAQ.txt.
if pad < 0.027:
if codec == "mp3" and pad < 0.027:
pad = 0.0
ff = ffmpy.FFmpeg(
inputs={wav_file: None},
Expand Down
2 changes: 1 addition & 1 deletion audiotools/core/playback.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
These are utilities that allow one to embed an AudioSignal
as a playable object in a Jupyter notebook, or to play audio from
the terminal, etc.
"""
""" # fmt: skip
import base64
import io
import random
Expand Down
2 changes: 1 addition & 1 deletion audiotools/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
Functions for comparing AudioSignal objects to one another.
"""
""" # fmt: skip
from . import distance
from . import quality
from . import spectral
2 changes: 1 addition & 1 deletion audiotools/ml/experiment.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Useful class for Experiment tracking, and ensuring code is
saved alongside files.
"""
""" # fmt: skip
import datetime
import os
import shlex
Expand Down
2 changes: 1 addition & 1 deletion examples/abx.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,6 @@ def build(user, samples, rating):
fn=build,
inputs=[user, samples, rating],
outputs=player.to_list() + [rating, begin, samples, progress],
).then(None, _js=pr.reset_player)
).then(None, js=pr.reset_player)

app.launch()
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
"ffmpy",
"ipython",
"rich",
"matplotlib",
"matplotlib==3.5", # See https://github.com/librosa/librosa/issues/1763#issuecomment-1742120524
"librosa",
"pystoi",
"torch_stoi",
Expand All @@ -65,7 +65,7 @@
"pytest-cov",
"line_profiler",
"pesq",
"gradio>=3.32.0",
"gradio==3.32.0",
"transformers>=4.23.1",
],
"docs": [
Expand Down
10 changes: 8 additions & 2 deletions tests/core/test_grad.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,15 @@ def _test_audio_grad(attr: str, target=True, kwargs: dict = {}):
# If necessary, propagate spectrogram changes to waveform
if result.stft_data is not None:
result.istft()
result.audio_data.sum().backward()
if result.audio_data.dtype.is_complex:
result.audio_data.real().sum().backward()
else:
result.audio_data.sum().backward()
else:
result.sum().backward()
if result.dtype.is_complex:
result.real.sum().backward()
else:
result.sum().backward()

assert signal.audio_data.grad is not None or not target
except RuntimeError:
Expand Down

0 comments on commit a1a2c86

Please sign in to comment.