senselab.audio.data_structures.audio
Audio data structure module.
1"""Audio data structure module.""" 2 3try: 4 import torchaudio 5 6 TORCHAUDIO_AVAILABLE = True 7except ModuleNotFoundError: 8 TORCHAUDIO_AVAILABLE = False 9 10try: 11 import soundfile as sf 12 13 SOUNDFILE_AVAILABLE = True 14except ModuleNotFoundError: 15 SOUNDFILE_AVAILABLE = False 16 17import os 18import uuid 19import warnings 20from typing import Any, Dict, Generator, List, Optional, Tuple, Union 21 22import numpy as np 23import torch 24from pydantic import BaseModel, Field, PrivateAttr 25 26from senselab.utils.constants import SENSELAB_NAMESPACE 27 28 29class Audio(BaseModel): 30 """Represents an audio file and its associated metadata. 31 32 Users should instantiate Audio via the constructor (from file path or waveform + sampling rate) 33 or the 'from_stream' method, which yiels Audio objects from a live audio stream. 34 35 Attributes: 36 metadata: A dictionary containing any additional metadata. 37 """ 38 39 # Private attributes used for lazy loading and internal state. 40 _file_path: Union[str, os.PathLike] = PrivateAttr(default="") # Path to audio file (if not pre-loaded) 41 _waveform: Optional[torch.Tensor] = PrivateAttr(default=None) # Audio data (lazy-loaded from file if not provided) 42 _sampling_rate: Optional[int] = PrivateAttr(default=None) # Actual sampling rate; loaded on demand 43 _offset_in_sec: float = PrivateAttr(default=0.0) # Offset in seconds from which to start loading audio 44 _duration_in_sec: Optional[float] = PrivateAttr(default=None) # Duration in seconds to load; None means full file 45 _backend: Optional[str] = PrivateAttr(default=None) # Backend to use when loading the audio 46 47 # Public fields: 48 metadata: Dict = Field(default={}) 49 model_config = {"arbitrary_types_allowed": True} 50 51 def __init__(self, **data: Any) -> None: # noqa: ANN401,D417 52 """Initialize an Audio instance. 53 54 Args: 55 waveform (optional): Pre-loaded audio data as a list, NumPy array, or torch.Tensor. 56 sampling_rate (optional): If provided, sets the sampling rate. 57 This must be provided if a waveform is supplied. 58 filepath (optional): File path for lazy loading the waveform if not provided. 59 offset_in_sec (optional): Offset (in seconds) from which to start reading the file. Defaults to 0.0. 60 duration_in_sec (optional): Duration (in seconds) to read from the file. If None, the full file is loaded. 61 backend (optional): I/O backend to use when loading the audio (e.g. "ffmpeg", "sox", "soundfile"). 62 metadata (optional): A dictionary of additional metadata. 63 64 Raises: 65 ValueError: If neither waveform nor filepath is provided. 66 """ 67 waveform = data.pop("waveform", None) 68 provided_sr = data.pop("sampling_rate", None) 69 filepath = data.pop("filepath", None) 70 offset_in_sec = data.pop("offset_in_sec", 0.0) 71 duration_in_sec = data.pop("duration_in_sec", None) 72 backend = data.pop("backend", None) 73 metadata = data.pop("metadata", {}) 74 75 super().__init__(**data) 76 77 if waveform is not None: 78 # If a waveform and sampling rate are provided, convert and store them; 79 if provided_sr is None: 80 raise ValueError("When a waveform is provided, a sampling_rate must also be supplied.") 81 self._waveform = self.convert_to_tensor(waveform) 82 self._sampling_rate = provided_sr 83 else: 84 # otherwise, a valid filepath is required for lazy loading. 
85 if not filepath: 86 raise ValueError("Either a waveform or a valid filepath must be provided to construct an Audio object.") 87 elif not os.path.exists(filepath): 88 raise FileNotFoundError(f"File {filepath} does not exist.") 89 else: 90 self._file_path = filepath 91 92 # Validate offset 93 if offset_in_sec < 0: 94 raise ValueError("Offset must be a non-negative value") 95 96 # Validate duration (allowing -1 to indicate full file) 97 if duration_in_sec is not None and duration_in_sec < 0 and duration_in_sec != -1: 98 raise ValueError("Duration must be -1 (for full file) or a positive value") 99 100 # Validate backend if provided 101 allowed_backends = {"ffmpeg", "sox", "soundfile"} 102 if backend is not None and backend not in allowed_backends: 103 raise ValueError("Unsupported backend") 104 105 self._offset_in_sec = offset_in_sec 106 self._duration_in_sec = duration_in_sec 107 self._backend = backend 108 109 # Set the metadata 110 self.metadata = metadata 111 112 @property 113 def waveform(self) -> torch.Tensor: 114 """Returns the audio waveform as a torch.Tensor. 115 116 If the waveform has not been loaded yet, it is loaded lazily from the file. 117 """ 118 if self._waveform is None: 119 # print("Lazy loading audio data from file...") 120 self._waveform = self.convert_to_tensor(self._lazy_load_data_from_filepath(self._file_path)) 121 assert self._waveform is not None, "Failed to load audio data." 122 return self._waveform 123 124 @property 125 def sampling_rate(self) -> int: 126 """Returns the sampling rate of the audio. 127 128 If the sampling rate is not set and a file is available, it is inferred from the file metadata. 129 """ 130 if self._sampling_rate is None: 131 if self._file_path and TORCHAUDIO_AVAILABLE: 132 info = torchaudio.info(self._file_path) 133 self._sampling_rate = info.sample_rate 134 else: 135 raise ValueError("Sampling rate is not available.") 136 assert self._sampling_rate is not None, "Sampling rate should be set." 137 return self._sampling_rate 138 139 @classmethod 140 def convert_to_tensor(cls, v: Union[List[float], List[List[float]], np.ndarray, torch.Tensor]) -> torch.Tensor: 141 """Converts input audio data to a torch.Tensor with shape (num_channels, num_samples). 142 143 Args: 144 v: Audio data in the form of a list, NumPy array, or torch.Tensor. 145 146 Returns: 147 A torch.Tensor representation of the audio data. 148 """ 149 if isinstance(v, list): 150 temporary_tensor = torch.tensor(v) 151 elif isinstance(v, np.ndarray): 152 temporary_tensor = torch.tensor(v) 153 elif isinstance(v, torch.Tensor): 154 temporary_tensor = v.clone() 155 else: 156 raise ValueError("Unsupported data type for audio conversion.") 157 158 if temporary_tensor.ndim == 1: 159 temporary_tensor = temporary_tensor.unsqueeze(0) 160 return temporary_tensor.to(torch.float32) 161 162 def _lazy_load_data_from_filepath(self, filepath: Union[str, os.PathLike]) -> torch.Tensor: 163 """Lazy-loads audio data from the given filepath. 164 165 Converts the stored offset and duration (in seconds) to the required frame indices for torchaudio. 166 167 Args: 168 filepath: The path to the audio file. 169 170 Returns: 171 A torch.Tensor containing the loaded audio data. 172 173 Raises: 174 ModuleNotFoundError: If torchaudio is not available. 175 ValueError: If the offset or duration exceeds the file duration. 176 """ 177 if not TORCHAUDIO_AVAILABLE: 178 raise ModuleNotFoundError( 179 "`torchaudio` is not installed. " 180 "Please install senselab audio dependencies using `pip install 'senselab[audio]'`." 
181 ) 182 183 info = torchaudio.info(filepath) 184 self._sampling_rate = info.sample_rate 185 total_frames = info.num_frames 186 187 # Convert offset_in_sec and duration_in_sec to frame indices. 188 frame_offset = int(self._offset_in_sec * self.sampling_rate) 189 if frame_offset > total_frames: 190 raise ValueError( 191 f"Offset ({self._offset_in_sec} s) exceeds the audio file duration " 192 f"({total_frames / self.sampling_rate:.2f} s)." 193 ) 194 if self._duration_in_sec is not None and self._duration_in_sec > 0: 195 num_frames = int(self._duration_in_sec * self.sampling_rate) 196 # Ensure we don't exceed the file length. 197 num_frames = min(num_frames, total_frames - frame_offset) 198 else: 199 num_frames = -1 # Indicates full file reading 200 201 array, _ = torchaudio.load( 202 filepath, 203 frame_offset=frame_offset, 204 num_frames=num_frames, 205 backend=self._backend, 206 ) 207 return array 208 209 def filepath(self) -> Union[str, None]: 210 """Returns the file path of the audio if available.""" 211 if self._file_path: 212 return str(self._file_path) 213 return None 214 215 def generate_id(self) -> str: 216 """Generates a unique identifier for the Audio. 217 218 The identifier is computed as an MD5-based UUID derived from the waveform and sampling rate. 219 220 Returns: 221 A string representing the generated unique identifier. 222 """ 223 # Use the waveform property so that lazy loading is triggered if needed. 224 unique_hash = uuid.uuid3(uuid.uuid3(SENSELAB_NAMESPACE, str(self.waveform)), str(self.sampling_rate)) 225 return str(unique_hash) 226 227 def __eq__(self, other: object) -> bool: 228 """Overrides equality to compare Audio objects based on their generated identifiers. 229 230 Args: 231 other: Another object to compare. 232 233 Returns: 234 True if both Audio instances have the same generated identifier, False otherwise. 235 """ 236 if isinstance(other, Audio): 237 return self.generate_id() == other.generate_id() 238 return False 239 240 def window_generator(self, window_size: int, step_size: int) -> Generator["Audio", None, None]: 241 """Creates a sliding window generator for the audio waveform. 242 243 Each yielded Audio instance corresponds to a window of the waveform. 244 245 Args: 246 window_size: Number of samples in each window. 247 step_size: Number of samples to advance for each window. 248 249 Yields: 250 Audio: A new Audio instance representing the current window. 251 """ 252 if step_size > window_size: 253 warnings.warn("Step size is greater than window size. Some portions of the audio may not be included.") 254 255 num_samples = self.waveform.size(-1) 256 current_position = 0 257 258 while current_position < num_samples: 259 end_position = min(current_position + window_size, num_samples) 260 window_waveform = self.waveform[:, current_position:end_position] 261 262 yield Audio( 263 waveform=window_waveform, 264 sampling_rate=self.sampling_rate, 265 metadata=self.metadata, 266 ) 267 current_position += step_size 268 269 def save_to_file( 270 self, 271 file_path: Union[str, os.PathLike], 272 format: Optional[str] = None, 273 encoding: Optional[str] = None, 274 bits_per_sample: Optional[int] = None, 275 buffer_size: int = 4096, 276 backend: Optional[str] = None, 277 compression: Optional[Union[float, int]] = None, 278 ) -> None: 279 """Saves the Audio object to a file using torchaudio.save. 280 281 Args: 282 file_path: Destination file path. 283 format: Audio format (e.g. "wav", "ogg", "flac"). Inferred from the file extension if None. 
284 encoding: Encoding to use (e.g. "PCM_S", "PCM_U"). Effective for formats like wav and flac. 285 bits_per_sample: Bit depth (e.g. 8, 16, 24, 32, 64). 286 buffer_size: Buffer size in bytes for processing. 287 backend: I/O backend to use (e.g. "ffmpeg", "sox", "soundfile"). 288 compression: Compression level for supported formats (e.g. mp3, flac, ogg). 289 290 Raises: 291 ModuleNotFoundError: If torchaudio is not available. 292 ValueError: If the waveform dimensions or sampling rate are invalid. 293 RuntimeError: If saving fails. 294 """ 295 if not TORCHAUDIO_AVAILABLE: 296 raise ModuleNotFoundError( 297 "`torchaudio` is not installed. " 298 "Please install senselab audio dependencies using `pip install 'senselab[audio]'`." 299 ) 300 301 if self.waveform.ndim != 2: 302 raise ValueError("Waveform must be a 2D tensor with shape (num_channels, num_samples).") 303 if self.sampling_rate <= 0: 304 raise ValueError("Sampling rate must be a positive integer.") 305 306 output_dir = os.path.dirname(file_path) 307 if not os.access(output_dir, os.W_OK): 308 raise RuntimeError(f"Output directory '{output_dir}' is not writable.") 309 310 try: 311 if not os.path.exists(output_dir): 312 os.makedirs(output_dir) 313 torchaudio.save( 314 uri=file_path, 315 src=self.waveform, 316 sample_rate=self.sampling_rate, 317 channels_first=True, 318 format=format, 319 encoding=encoding, 320 bits_per_sample=bits_per_sample, 321 buffer_size=buffer_size, 322 backend=backend, 323 compression=compression, 324 ) 325 except Exception as e: 326 raise RuntimeError(f"Error saving audio to file: {e}") from e 327 328 @classmethod 329 def from_stream( 330 cls, 331 stream_source: Union[str, os.PathLike, bytes], 332 chunk_duration_in_sec: float = 1.0, 333 metadata: Optional[Dict] = None, 334 ) -> Generator["Audio", None, None]: 335 """Yield Audio objects from a live audio stream in fixed-duration chunks. 336 337 Args: 338 stream_source: A file path, stream, or bytes-like object. 339 chunk_duration_in_sec: Duration (in seconds) of each audio chunk. 340 metadata: Additional metadata for each chunk. 341 342 Yields: 343 Audio objects for each chunk read from the stream. 344 """ 345 if not SOUNDFILE_AVAILABLE: 346 raise ModuleNotFoundError( 347 "`soundfile` is not installed. " 348 "Please install senselab audio dependencies using `pip install 'senselab[audio]'`." 349 ) 350 351 if isinstance(stream_source, (os.PathLike, str)) and not os.path.exists(stream_source): 352 raise FileNotFoundError(f"File {stream_source} does not exist.") 353 354 with sf.SoundFile(stream_source, "r") as audio_file: 355 sampling_rate = audio_file.samplerate 356 chunk_frames = int(chunk_duration_in_sec * sampling_rate) 357 358 while True: 359 chunk = audio_file.read(frames=chunk_frames, dtype="float32", always_2d=True) 360 if chunk.shape[0] == 0: 361 break 362 yield cls( 363 waveform=chunk.T, 364 sampling_rate=sampling_rate, 365 metadata=metadata if metadata else {}, 366 ) 367 368 369def batch_audios(audios: List[Audio]) -> Tuple[torch.Tensor, Union[int, List[int]], List[Dict]]: 370 """Batches a list of Audio objects into a single Tensor while preserving individual metadata. 371 372 Args: 373 audios: List of Audio objects. They should all have the same number of channels. 374 It is advised that they also share the same sampling rate when required by processing. 
375 376 Returns: 377 A tuple containing: 378 - A Tensor of shape (batch_size, num_channels, num_samples), 379 - The sampling rate (as an integer if uniform, or a list otherwise), 380 - A list of each audio's metadata. 381 382 Raises: 383 RuntimeError: If the Audio objects do not share the same number of channels. 384 """ 385 sampling_rates = [] 386 num_channels_list = [] 387 lengths = [] 388 batched_audio = [] 389 metadatas = [] 390 391 for audio in audios: 392 sampling_rates.append(audio.sampling_rate) 393 num_channels_list.append(audio.waveform.shape[0]) 394 lengths.append(audio.waveform.shape[1]) 395 metadatas.append(audio.metadata) 396 397 if len(set(num_channels_list)) != 1: 398 raise RuntimeError("All audios must have the same number of channels.") 399 400 if len(set(sampling_rates)) != 1: 401 warnings.warn("Not all sampling rates are the same.", UserWarning) 402 403 max_length = max(lengths) 404 for audio in audios: 405 waveform = audio.waveform 406 padding = max_length - waveform.shape[1] 407 if padding > 0: 408 pad = torch.zeros((waveform.shape[0], padding), dtype=waveform.dtype) 409 waveform = torch.cat([waveform, pad], dim=1) 410 batched_audio.append(waveform) 411 412 return_sampling_rate: Union[int, List[int]] = ( 413 int(sampling_rates[0]) if len(set(sampling_rates)) == 1 else sampling_rates 414 ) 415 416 return torch.stack(batched_audio), return_sampling_rate, metadatas 417 418 419def unbatch_audios( 420 batched_audio: torch.Tensor, sampling_rates: Union[int, List[int]], metadatas: List[Dict] 421) -> List[Audio]: 422 """Unbatches a Tensor of audio data back into a list of Audio objects. 423 424 Args: 425 batched_audio: Tensor of shape (batch_size, num_channels, num_samples). 426 sampling_rates: A single sampling rate (if uniform) or a list of sampling rates. 427 metadatas: A list of metadata dictionaries for each audio. 428 429 Returns: 430 A list of Audio objects reconstituted from the batched data. 431 432 Raises: 433 ValueError: If the batched_audio shape is invalid or if the number of items mismatches. 434 """ 435 if len(batched_audio.shape) != 3: 436 raise ValueError("Expected batched_audio to have shape (batch_size, num_channels, num_samples).") 437 if batched_audio.shape[0] != len(metadatas) or ( 438 isinstance(sampling_rates, list) and batched_audio.shape[0] != len(sampling_rates) 439 ): 440 raise ValueError("Batch size, sampling_rates, and metadatas must all have the same number of elements.") 441 442 audios = [] 443 for i in range(len(metadatas)): 444 sr = sampling_rates[i] if isinstance(sampling_rates, list) else sampling_rates 445 audios.append(Audio(waveform=batched_audio[i], sampling_rate=sr, metadata=metadatas[i])) 446 return audios
class Audio(BaseModel)
Represents an audio file and its associated metadata.
Users should instantiate Audio via the constructor (from a file path, or from a waveform plus sampling rate) or via the 'from_stream' method, which yields Audio objects from a live audio stream.
Attributes:
- metadata: A dictionary containing any additional metadata.
def __init__(self, **data: Any) -> None
Initialize an Audio instance.
Arguments:
- waveform (optional): Pre-loaded audio data as a list, NumPy array, or torch.Tensor.
- sampling_rate (optional): Sampling rate of the provided waveform, in Hz. Required whenever a waveform is supplied.
- filepath (optional): File path for lazy loading the waveform if not provided.
- offset_in_sec (optional): Offset (in seconds) from which to start reading the file. Defaults to 0.0.
- duration_in_sec (optional): Duration (in seconds) to read from the file. If None, the full file is loaded.
- backend (optional): I/O backend to use when loading the audio (e.g. "ffmpeg", "sox", "soundfile").
- metadata (optional): A dictionary of additional metadata.
Raises:
- ValueError: If neither waveform nor filepath is provided.
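A minimal usage sketch (the file path below is a placeholder; with lazy loading, the file is only read when the waveform or sampling rate is first accessed):

```python
import torch

from senselab.audio.data_structures.audio import Audio

# From an in-memory waveform: one second of silence, mono, 16 kHz.
silence = Audio(waveform=torch.zeros(1, 16000), sampling_rate=16000)

# From a file (placeholder path; the file must exist). Reads 2.0 s starting at 0.5 s.
clip = Audio(filepath="speech.wav", offset_in_sec=0.5, duration_in_sec=2.0)
print(clip.sampling_rate)   # inferred from the file header on first access
print(clip.waveform.shape)  # torch.Size([num_channels, num_samples])
```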
waveform: torch.Tensor (property)
Returns the audio waveform as a torch.Tensor.
If the waveform has not been loaded yet, it is loaded lazily from the file.
sampling_rate: int (property)
Returns the sampling rate of the audio.
If the sampling rate is not set and a file is available, it is inferred from the file metadata.
def convert_to_tensor(cls, v: Union[List[float], List[List[float]], np.ndarray, torch.Tensor]) -> torch.Tensor (classmethod)
Converts input audio data to a torch.Tensor with shape (num_channels, num_samples).
Arguments:
- v: Audio data in the form of a list, NumPy array, or torch.Tensor.
Returns:
A torch.Tensor representation of the audio data.
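A short sketch of the shape handling described above: 1-D input gains a channel dimension, and everything is cast to float32.

```python
import numpy as np

from senselab.audio.data_structures.audio import Audio

mono = Audio.convert_to_tensor([0.0, 0.1, -0.1])        # 1-D list -> shape (1, 3)
stereo = Audio.convert_to_tensor(np.zeros((2, 16000)))  # 2-D array kept as (2, 16000)
print(mono.shape, mono.dtype)  # torch.Size([1, 3]) torch.float32
```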
def filepath(self) -> Union[str, None]
Returns the file path of the audio if available.
def generate_id(self) -> str
Generates a unique identifier for the Audio.
The identifier is computed as an MD5-based UUID derived from the waveform and sampling rate.
Returns:
A string representing the generated unique identifier.
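Because the identifier is derived from content, two Audio objects holding identical waveforms and sampling rates compare equal (this is what the class's __eq__ uses). A small sketch:

```python
import torch

from senselab.audio.data_structures.audio import Audio

a = Audio(waveform=torch.zeros(1, 8000), sampling_rate=8000)
b = Audio(waveform=torch.zeros(1, 8000), sampling_rate=8000)
print(a.generate_id() == b.generate_id())  # True: same content, same ID
print(a == b)                              # True: equality compares generated IDs
```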
def window_generator(self, window_size: int, step_size: int) -> Generator["Audio", None, None]
Creates a sliding window generator for the audio waveform.
Each yielded Audio instance corresponds to a window of the waveform.
Arguments:
- window_size: Number of samples in each window.
- step_size: Number of samples to advance for each window.
Yields:
Audio: A new Audio instance representing the current window.
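For example, 1-second windows with 50% overlap at 16 kHz (note that both sizes are in samples, not seconds):

```python
import torch

from senselab.audio.data_structures.audio import Audio

audio = Audio(waveform=torch.randn(1, 48000), sampling_rate=16000)
for window in audio.window_generator(window_size=16000, step_size=8000):
    print(window.waveform.shape)  # torch.Size([1, 16000]); trailing windows may be shorter
```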
def save_to_file(self, file_path: Union[str, os.PathLike], format: Optional[str] = None, encoding: Optional[str] = None, bits_per_sample: Optional[int] = None, buffer_size: int = 4096, backend: Optional[str] = None, compression: Optional[Union[float, int]] = None) -> None
Saves the Audio object to a file using torchaudio.save.
Arguments:
- file_path: Destination file path.
- format: Audio format (e.g. "wav", "ogg", "flac"). Inferred from the file extension if None.
- encoding: Encoding to use (e.g. "PCM_S", "PCM_U"). Effective for formats like wav and flac.
- bits_per_sample: Bit depth (e.g. 8, 16, 24, 32, 64).
- buffer_size: Buffer size in bytes for processing.
- backend: I/O backend to use (e.g. "ffmpeg", "sox", "soundfile").
- compression: Compression level for supported formats (e.g. mp3, flac, ogg).
Raises:
- ModuleNotFoundError: If torchaudio is not available.
- ValueError: If the waveform dimensions or sampling rate are invalid.
- RuntimeError: If saving fails.
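A minimal sketch (file names are placeholders; per the source above, the destination directory must already exist and be writable, and torchaudio must be installed):

```python
import torch

from senselab.audio.data_structures.audio import Audio

audio = Audio(waveform=torch.zeros(1, 16000), sampling_rate=16000)
audio.save_to_file("./silence.wav", bits_per_sample=16)  # format inferred from extension
audio.save_to_file("./silence.flac", compression=5)      # FLAC compression level
```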
def from_stream(cls, stream_source: Union[str, os.PathLike, bytes], chunk_duration_in_sec: float = 1.0, metadata: Optional[Dict] = None) -> Generator["Audio", None, None] (classmethod)
Yield Audio objects from a live audio stream in fixed-duration chunks.
Arguments:
- stream_source: A file path, stream, or bytes-like object.
- chunk_duration_in_sec: Duration (in seconds) of each audio chunk.
- metadata: Additional metadata for each chunk.
Yields:
Audio objects for each chunk read from the stream.
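A sketch that reads a file in half-second chunks (placeholder path; requires soundfile):

```python
from senselab.audio.data_structures.audio import Audio

for chunk in Audio.from_stream("speech.wav", chunk_duration_in_sec=0.5):
    # For mono 16 kHz input this prints torch.Size([1, 8000]) per chunk;
    # the final chunk may be shorter.
    print(chunk.waveform.shape, chunk.sampling_rate)
```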
def batch_audios(audios: List[Audio]) -> Tuple[torch.Tensor, Union[int, List[int]], List[Dict]]
Batches a list of Audio objects into a single Tensor while preserving individual metadata.
Arguments:
- audios: List of Audio objects. They must all have the same number of channels; sharing the same sampling rate is also recommended, since downstream processing often requires it.
Returns:
A tuple containing:
- A Tensor of shape (batch_size, num_channels, num_samples),
- The sampling rate (as an integer if uniform, or a list otherwise),
- A list of each audio's metadata.
Raises:
- RuntimeError: If the Audio objects do not share the same number of channels.
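For instance, batching two mono clips of different lengths (the shorter one is zero-padded on the right):

```python
import torch

from senselab.audio.data_structures.audio import Audio, batch_audios

a = Audio(waveform=torch.randn(1, 16000), sampling_rate=16000)
b = Audio(waveform=torch.randn(1, 12000), sampling_rate=16000)

batch, sr, metas = batch_audios([a, b])
print(batch.shape)  # torch.Size([2, 1, 16000])
print(sr)           # 16000 (a single int because all rates match)
```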
def unbatch_audios(batched_audio: torch.Tensor, sampling_rates: Union[int, List[int]], metadatas: List[Dict]) -> List[Audio]
Unbatches a Tensor of audio data back into a list of Audio objects.
Arguments:
- batched_audio: Tensor of shape (batch_size, num_channels, num_samples).
- sampling_rates: A single sampling rate (if uniform) or a list of sampling rates.
- metadatas: A list of metadata dictionaries for each audio.
Returns:
A list of Audio objects reconstituted from the batched data.
Raises:
- ValueError: If the batched_audio shape is invalid or if the number of items mismatches.
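Continuing the batch_audios sketch above, the batch can be split back into individual Audio objects; note that zero-padding added during batching is retained:

```python
from senselab.audio.data_structures.audio import unbatch_audios

audios = unbatch_audios(batch, sr, metas)
print(len(audios))               # 2
print(audios[1].waveform.shape)  # torch.Size([1, 16000]): padding is not stripped
```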