Merge pull request #104 from descriptinc/angela/ed-19914-studio-sound-is-ignoring-start_pts-0027-seconds

[Studio Sound] Ignore start_pts only for mp3 files
descriptinc · Feb 1, 2024 · a1a2c86 · a1a2c86
2 parents 2638236 + 831b42b
commit a1a2c86
Show file tree

Hide file tree

Showing 8 changed files with 35 additions and 23 deletions.
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
@@ -37,7 +37,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install ffmpeg
-          sudo apt-get install libsndfile1-dev
+          sudo apt-get install libsndfile1-dev libsox-dev libsox-fmt-mp3
           python -m pip install --upgrade pip
           pip install wheel
       - name: Install recent FFMPEG

diff --git a/audiotools/core/ffmpeg.py b/audiotools/core/ffmpeg.py
@@ -3,6 +3,7 @@
 import subprocess
 import tempfile
 from pathlib import Path
+from typing import Tuple
 
 import ffmpy
 import numpy as np
@@ -61,20 +62,26 @@ def r128stats(filepath: str, quiet: bool):
     return stats_dict
 
 
-def ffprobe_offset(path):
+def ffprobe_offset_and_codec(path: str) -> Tuple[float, str]:
+    """Given a path to a file, returns the start time offset and codec of
+    the first audio stream (offset in seconds; 0.0 and None if none is found).
+    """
     ff = ffmpy.FFprobe(
         inputs={path: None},
-        global_options="-show_entries format=start_time:stream=duration,start_time,codec_type,start_pts,time_base -of json -v quiet",
+        global_options="-show_entries format=start_time:stream=duration,start_time,codec_type,codec_name,start_pts,time_base -of json -v quiet",
     )
     streams = json.loads(ff.run(stdout=subprocess.PIPE)[0])["streams"]
     seconds_offset = 0.0
-    # Get the offset of the first audio stream we find
+    codec = None
+
+    # Get the offset and codec of the first audio stream we find
     # and return its start time, if it has one.
     for stream in streams:
         if stream["codec_type"] == "audio":
             seconds_offset = stream.get("start_time", 0.0)
+            codec = stream.get("codec_name")
             break
-    return float(seconds_offset)
+    return float(seconds_offset), codec
 
 
 class FFMPEGMixin:
@@ -174,17 +181,16 @@ def load_from_file_with_ffmpeg(cls, audio_path: str, quiet: bool = True, **kwarg
             )
             ff.run()
 
-            # We pad the file using the start time offset
-            # in case it's an audio stream starting at some
-            # offset in a video container.
-            pad = ffprobe_offset(audio_path)
-            # Don't pad files with discrepancies less than
-            # 0.027s - it's likely due to codec latency.
-            # The amount of latency introduced by mp3 is
-            # 1152, which is 0.0261 44khz. So we
-            # set the threshold here slightly above that.
+            # We pad the file using the start time offset in case it's an audio
+            # stream starting at some offset in a video container.
+            pad, codec = ffprobe_offset_and_codec(audio_path)
+
+            # For mp3s, don't pad files with discrepancies less than 0.027s -
+            # it's likely due to codec latency. The mp3 codec introduces 1152
+            # samples of latency, which is about 0.0261s at 44.1kHz, so we set
+            # the threshold here slightly above that.
             # Source: https://lame.sourceforge.io/tech-FAQ.txt.
-            if pad < 0.027:
+            if codec == "mp3" and pad < 0.027:
                 pad = 0.0
             ff = ffmpy.FFmpeg(
                 inputs={wav_file: None},

diff --git a/audiotools/core/playback.py b/audiotools/core/playback.py
@@ -2,7 +2,7 @@
 These are utilities that allow one to embed an AudioSignal
 as a playable object in a Jupyter notebook, or to play audio from
 the terminal, etc.
-"""
+"""  # fmt: skip
 import base64
 import io
 import random

diff --git a/audiotools/metrics/__init__.py b/audiotools/metrics/__init__.py
@@ -1,6 +1,6 @@
 """
 Functions for comparing AudioSignal objects to one another.
-"""
+"""  # fmt: skip
 from . import distance
 from . import quality
 from . import spectral
diff --git a/audiotools/ml/experiment.py b/audiotools/ml/experiment.py
@@ -1,7 +1,7 @@
 """
 Useful class for Experiment tracking, and ensuring code is
 saved alongside files.
-"""
+"""  # fmt: skip
 import datetime
 import os
 import shlex

diff --git a/examples/abx.py b/examples/abx.py
@@ -98,6 +98,6 @@ def build(user, samples, rating):
         fn=build,
         inputs=[user, samples, rating],
         outputs=player.to_list() + [rating, begin, samples, progress],
-    ).then(None, _js=pr.reset_player)
+    ).then(None, js=pr.reset_player)
 
     app.launch()
diff --git a/setup.py b/setup.py
@@ -46,7 +46,7 @@
         "ffmpy",
         "ipython",
         "rich",
-        "matplotlib",
+        "matplotlib==3.5",  # See https://github.com/librosa/librosa/issues/1763#issuecomment-1742120524
         "librosa",
         "pystoi",
         "torch_stoi",
@@ -65,7 +65,7 @@
             "pytest-cov",
             "line_profiler",
             "pesq",
-            "gradio>=3.32.0",
+            "gradio==3.32.0",
             "transformers>=4.23.1",
         ],
         "docs": [

diff --git a/tests/core/test_grad.py b/tests/core/test_grad.py
@@ -27,9 +27,15 @@ def _test_audio_grad(attr: str, target=True, kwargs: dict = {}):
                 # If necessary, propagate spectrogram changes to waveform
                 if result.stft_data is not None:
                     result.istft()
-                result.audio_data.sum().backward()
+                if result.audio_data.dtype.is_complex:
+                    result.audio_data.real().sum().backward()
+                else:
+                    result.audio_data.sum().backward()
             else:
-                result.sum().backward()
+                if result.dtype.is_complex:
+                    result.real.sum().backward()
+                else:
+                    result.sum().backward()
 
             assert signal.audio_data.grad is not None or not target
         except RuntimeError: