senselab.audio.data_structures.audio

Audio data structure module.

  1"""Audio data structure module."""
  2
  3try:
  4    import torchaudio
  5
  6    TORCHAUDIO_AVAILABLE = True
  7except ModuleNotFoundError:
  8    TORCHAUDIO_AVAILABLE = False
  9
 10try:
 11    import soundfile as sf
 12
 13    SOUNDFILE_AVAILABLE = True
 14except ModuleNotFoundError:
 15    SOUNDFILE_AVAILABLE = False
 16
 17import os
 18import uuid
 19import warnings
 20from typing import Any, Dict, Generator, List, Optional, Tuple, Union
 21
 22import numpy as np
 23import torch
 24from pydantic import BaseModel, Field, PrivateAttr
 25
 26from senselab.utils.constants import SENSELAB_NAMESPACE
 27
 28
 29class Audio(BaseModel):
 30    """Represents an audio file and its associated metadata.
 31
 32    Users should instantiate Audio via the constructor (from file path or waveform + sampling rate)
 33    or the 'from_stream' method, which yiels Audio objects from a live audio stream.
 34
 35    Attributes:
 36        metadata: A dictionary containing any additional metadata.
 37    """

    # Private attributes used for lazy loading and internal state.
    _file_path: Union[str, os.PathLike] = PrivateAttr(default="")  # Path to audio file (if not pre-loaded)
    _waveform: Optional[torch.Tensor] = PrivateAttr(default=None)  # Audio data (lazy-loaded from file if not provided)
    _sampling_rate: Optional[int] = PrivateAttr(default=None)  # Actual sampling rate; loaded on demand
    _offset_in_sec: float = PrivateAttr(default=0.0)  # Offset in seconds from which to start loading audio
    _duration_in_sec: Optional[float] = PrivateAttr(default=None)  # Duration in seconds to load; None means full file
    _backend: Optional[str] = PrivateAttr(default=None)  # Backend to use when loading the audio

    # Public fields:
    metadata: Dict = Field(default={})
    model_config = {"arbitrary_types_allowed": True}

    def __init__(self, **data: Any) -> None:  # noqa: ANN401,D417
        """Initialize an Audio instance.

        Args:
            waveform (optional): Pre-loaded audio data as a list, NumPy array, or torch.Tensor.
            sampling_rate (optional): If provided, sets the sampling rate.
                This must be provided if a waveform is supplied.
            filepath (optional): File path for lazy loading the waveform if not provided.
            offset_in_sec (optional): Offset (in seconds) from which to start reading the file. Defaults to 0.0.
            duration_in_sec (optional): Duration (in seconds) to read from the file. If None, the full file is loaded.
            backend (optional): I/O backend to use when loading the audio (e.g. "ffmpeg", "sox", "soundfile").
            metadata (optional): A dictionary of additional metadata.

        Raises:
            ValueError: If neither waveform nor filepath is provided.
        """
        waveform = data.pop("waveform", None)
        provided_sr = data.pop("sampling_rate", None)
        filepath = data.pop("filepath", None)
        offset_in_sec = data.pop("offset_in_sec", 0.0)
        duration_in_sec = data.pop("duration_in_sec", None)
        backend = data.pop("backend", None)
        metadata = data.pop("metadata", {})

        super().__init__(**data)

        if waveform is not None:
            # If a waveform and sampling rate are provided, convert and store them;
            if provided_sr is None:
                raise ValueError("When a waveform is provided, a sampling_rate must also be supplied.")
            self._waveform = self.convert_to_tensor(waveform)
            self._sampling_rate = provided_sr
        else:
            # otherwise, a valid filepath is required for lazy loading.
            if not filepath:
                raise ValueError("Either a waveform or a valid filepath must be provided to construct an Audio object.")
            elif not os.path.exists(filepath):
                raise FileNotFoundError(f"File {filepath} does not exist.")
            else:
                self._file_path = filepath

        # Validate offset
        if offset_in_sec < 0:
            raise ValueError("Offset must be a non-negative value")

        # Validate duration (allowing -1 to indicate full file)
        if duration_in_sec is not None and duration_in_sec < 0 and duration_in_sec != -1:
            raise ValueError("Duration must be -1 (for full file) or a positive value")

        # Validate backend if provided
        allowed_backends = {"ffmpeg", "sox", "soundfile"}
        if backend is not None and backend not in allowed_backends:
            raise ValueError("Unsupported backend")

        self._offset_in_sec = offset_in_sec
        self._duration_in_sec = duration_in_sec
        self._backend = backend

        # Set the metadata
        self.metadata = metadata

    @property
    def waveform(self) -> torch.Tensor:
        """Returns the audio waveform as a torch.Tensor.

        If the waveform has not been loaded yet, it is loaded lazily from the file.
        """
        if self._waveform is None:
            self._waveform = self.convert_to_tensor(self._lazy_load_data_from_filepath(self._file_path))
        assert self._waveform is not None, "Failed to load audio data."
        return self._waveform

    @property
    def sampling_rate(self) -> int:
        """Returns the sampling rate of the audio.

        If the sampling rate is not set and a file is available, it is inferred from the file metadata.
        """
        if self._sampling_rate is None:
            if self._file_path and TORCHAUDIO_AVAILABLE:
                info = torchaudio.info(self._file_path)
                self._sampling_rate = info.sample_rate
            else:
                raise ValueError("Sampling rate is not available.")
        assert self._sampling_rate is not None, "Sampling rate should be set."
        return self._sampling_rate

    @classmethod
    def convert_to_tensor(cls, v: Union[List[float], List[List[float]], np.ndarray, torch.Tensor]) -> torch.Tensor:
        """Converts input audio data to a torch.Tensor with shape (num_channels, num_samples).

        Args:
            v: Audio data in the form of a list, NumPy array, or torch.Tensor.

        Returns:
            A torch.Tensor representation of the audio data.
        """
        if isinstance(v, list):
            temporary_tensor = torch.tensor(v)
        elif isinstance(v, np.ndarray):
            temporary_tensor = torch.tensor(v)
        elif isinstance(v, torch.Tensor):
            temporary_tensor = v.clone()
        else:
            raise ValueError("Unsupported data type for audio conversion.")

        if temporary_tensor.ndim == 1:
            temporary_tensor = temporary_tensor.unsqueeze(0)
        return temporary_tensor.to(torch.float32)

    def _lazy_load_data_from_filepath(self, filepath: Union[str, os.PathLike]) -> torch.Tensor:
        """Lazy-loads audio data from the given filepath.

        Converts the stored offset and duration (in seconds) to the required frame indices for torchaudio.

        Args:
            filepath: The path to the audio file.

        Returns:
            A torch.Tensor containing the loaded audio data.

        Raises:
            ModuleNotFoundError: If torchaudio is not available.
            ValueError: If the offset or duration exceeds the file duration.
        """
        if not TORCHAUDIO_AVAILABLE:
            raise ModuleNotFoundError(
                "`torchaudio` is not installed. "
                "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
            )

        info = torchaudio.info(filepath)
        self._sampling_rate = info.sample_rate
        total_frames = info.num_frames

        # Convert offset_in_sec and duration_in_sec to frame indices.
        frame_offset = int(self._offset_in_sec * self.sampling_rate)
        if frame_offset > total_frames:
            raise ValueError(
                f"Offset ({self._offset_in_sec} s) exceeds the audio file duration "
                f"({total_frames / self.sampling_rate:.2f} s)."
            )
        if self._duration_in_sec is not None and self._duration_in_sec > 0:
            num_frames = int(self._duration_in_sec * self.sampling_rate)
            # Ensure we don't exceed the file length.
            num_frames = min(num_frames, total_frames - frame_offset)
        else:
            num_frames = -1  # Indicates full file reading

        array, _ = torchaudio.load(
            filepath,
            frame_offset=frame_offset,
            num_frames=num_frames,
            backend=self._backend,
        )
        return array

    def filepath(self) -> Union[str, None]:
        """Returns the file path of the audio if available."""
        if self._file_path:
            return str(self._file_path)
        return None

    def generate_id(self) -> str:
        """Generates a unique identifier for the Audio.

        The identifier is computed as an MD5-based UUID derived from the waveform and sampling rate.

        Returns:
            A string representing the generated unique identifier.
        """
        # Use the waveform property so that lazy loading is triggered if needed.
        unique_hash = uuid.uuid3(uuid.uuid3(SENSELAB_NAMESPACE, str(self.waveform)), str(self.sampling_rate))
        return str(unique_hash)

    def __eq__(self, other: object) -> bool:
        """Overrides equality to compare Audio objects based on their generated identifiers.

        Args:
            other: Another object to compare.

        Returns:
            True if both Audio instances have the same generated identifier, False otherwise.
        """
        if isinstance(other, Audio):
            return self.generate_id() == other.generate_id()
        return False

    def window_generator(self, window_size: int, step_size: int) -> Generator["Audio", None, None]:
        """Creates a sliding window generator for the audio waveform.

        Each yielded Audio instance corresponds to a window of the waveform.

        Args:
            window_size: Number of samples in each window.
            step_size: Number of samples to advance for each window.

        Yields:
            Audio: A new Audio instance representing the current window.
        """
        if step_size > window_size:
            warnings.warn("Step size is greater than window size. Some portions of the audio may not be included.")

        num_samples = self.waveform.size(-1)
        current_position = 0

        while current_position < num_samples:
            end_position = min(current_position + window_size, num_samples)
            window_waveform = self.waveform[:, current_position:end_position]

            yield Audio(
                waveform=window_waveform,
                sampling_rate=self.sampling_rate,
                metadata=self.metadata,
            )
            current_position += step_size

    def save_to_file(
        self,
        file_path: Union[str, os.PathLike],
        format: Optional[str] = None,
        encoding: Optional[str] = None,
        bits_per_sample: Optional[int] = None,
        buffer_size: int = 4096,
        backend: Optional[str] = None,
        compression: Optional[Union[float, int]] = None,
    ) -> None:
        """Saves the Audio object to a file using torchaudio.save.

        Args:
            file_path: Destination file path.
            format: Audio format (e.g. "wav", "ogg", "flac"). Inferred from the file extension if None.
            encoding: Encoding to use (e.g. "PCM_S", "PCM_U"). Effective for formats like wav and flac.
            bits_per_sample: Bit depth (e.g. 8, 16, 24, 32, 64).
            buffer_size: Buffer size in bytes for processing.
            backend: I/O backend to use (e.g. "ffmpeg", "sox", "soundfile").
            compression: Compression level for supported formats (e.g. mp3, flac, ogg).

        Raises:
            ModuleNotFoundError: If torchaudio is not available.
            ValueError: If the waveform dimensions or sampling rate are invalid.
            RuntimeError: If saving fails.
        """
        if not TORCHAUDIO_AVAILABLE:
            raise ModuleNotFoundError(
                "`torchaudio` is not installed. "
                "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
            )

        if self.waveform.ndim != 2:
            raise ValueError("Waveform must be a 2D tensor with shape (num_channels, num_samples).")
        if self.sampling_rate <= 0:
            raise ValueError("Sampling rate must be a positive integer.")

        # Ensure the output directory exists before checking that it is writable.
        # Defaulting to "." covers bare filenames with no directory component.
        output_dir = os.path.dirname(file_path) or "."
        os.makedirs(output_dir, exist_ok=True)
        if not os.access(output_dir, os.W_OK):
            raise RuntimeError(f"Output directory '{output_dir}' is not writable.")

        try:
            torchaudio.save(
                uri=file_path,
                src=self.waveform,
                sample_rate=self.sampling_rate,
                channels_first=True,
                format=format,
                encoding=encoding,
                bits_per_sample=bits_per_sample,
                buffer_size=buffer_size,
                backend=backend,
                compression=compression,
            )
        except Exception as e:
            raise RuntimeError(f"Error saving audio to file: {e}") from e

    @classmethod
    def from_stream(
        cls,
        stream_source: Union[str, os.PathLike, bytes],
        chunk_duration_in_sec: float = 1.0,
        metadata: Optional[Dict] = None,
    ) -> Generator["Audio", None, None]:
        """Yield Audio objects from a live audio stream in fixed-duration chunks.

        Args:
            stream_source: A file path, stream, or bytes-like object.
            chunk_duration_in_sec: Duration (in seconds) of each audio chunk.
            metadata: Additional metadata for each chunk.

        Yields:
            Audio objects for each chunk read from the stream.
        """
        if not SOUNDFILE_AVAILABLE:
            raise ModuleNotFoundError(
                "`soundfile` is not installed. "
                "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
            )

        if isinstance(stream_source, (os.PathLike, str)) and not os.path.exists(stream_source):
            raise FileNotFoundError(f"File {stream_source} does not exist.")

        with sf.SoundFile(stream_source, "r") as audio_file:
            sampling_rate = audio_file.samplerate
            chunk_frames = int(chunk_duration_in_sec * sampling_rate)

            while True:
                chunk = audio_file.read(frames=chunk_frames, dtype="float32", always_2d=True)
                if chunk.shape[0] == 0:
                    break
                yield cls(
                    waveform=chunk.T,
                    sampling_rate=sampling_rate,
                    metadata=metadata if metadata else {},
                )


def batch_audios(audios: List[Audio]) -> Tuple[torch.Tensor, Union[int, List[int]], List[Dict]]:
    """Batches a list of Audio objects into a single Tensor while preserving individual metadata.

    Args:
        audios: List of Audio objects. They should all have the same number of channels.
                It is advised that they also share the same sampling rate when required by processing.

    Returns:
        A tuple containing:
            - A Tensor of shape (batch_size, num_channels, num_samples),
            - The sampling rate (as an integer if uniform, or a list otherwise),
            - A list of each audio's metadata.

    Raises:
        RuntimeError: If the Audio objects do not share the same number of channels.
    """
    sampling_rates = []
    num_channels_list = []
    lengths = []
    batched_audio = []
    metadatas = []

    for audio in audios:
        sampling_rates.append(audio.sampling_rate)
        num_channels_list.append(audio.waveform.shape[0])
        lengths.append(audio.waveform.shape[1])
        metadatas.append(audio.metadata)

    if len(set(num_channels_list)) != 1:
        raise RuntimeError("All audios must have the same number of channels.")

    if len(set(sampling_rates)) != 1:
        warnings.warn("Not all sampling rates are the same.", UserWarning)

    max_length = max(lengths)
    for audio in audios:
        waveform = audio.waveform
        padding = max_length - waveform.shape[1]
        if padding > 0:
            pad = torch.zeros((waveform.shape[0], padding), dtype=waveform.dtype)
            waveform = torch.cat([waveform, pad], dim=1)
        batched_audio.append(waveform)

    return_sampling_rate: Union[int, List[int]] = (
        int(sampling_rates[0]) if len(set(sampling_rates)) == 1 else sampling_rates
    )

    return torch.stack(batched_audio), return_sampling_rate, metadatas


def unbatch_audios(
    batched_audio: torch.Tensor, sampling_rates: Union[int, List[int]], metadatas: List[Dict]
) -> List[Audio]:
    """Unbatches a Tensor of audio data back into a list of Audio objects.

    Args:
        batched_audio: Tensor of shape (batch_size, num_channels, num_samples).
        sampling_rates: A single sampling rate (if uniform) or a list of sampling rates.
        metadatas: A list of metadata dictionaries for each audio.

    Returns:
        A list of Audio objects reconstituted from the batched data.

    Raises:
        ValueError: If the batched_audio shape is invalid or if the number of items mismatches.
    """
    if len(batched_audio.shape) != 3:
        raise ValueError("Expected batched_audio to have shape (batch_size, num_channels, num_samples).")
    if batched_audio.shape[0] != len(metadatas) or (
        isinstance(sampling_rates, list) and batched_audio.shape[0] != len(sampling_rates)
    ):
        raise ValueError("Batch size, sampling_rates, and metadatas must all have the same number of elements.")

    audios = []
    for i in range(len(metadatas)):
        sr = sampling_rates[i] if isinstance(sampling_rates, list) else sampling_rates
        audios.append(Audio(waveform=batched_audio[i], sampling_rate=sr, metadata=metadatas[i]))
    return audios
class Audio(pydantic.main.BaseModel):

Represents an audio file and its associated metadata.

Users should instantiate Audio via the constructor (from file path or waveform + sampling rate) or the 'from_stream' method, which yields Audio objects from a live audio stream.

Attributes:
  • metadata: A dictionary containing any additional metadata.
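
A minimal construction sketch, assuming the import path shown in the module header; the file path is a hypothetical placeholder:

import torch

from senselab.audio.data_structures.audio import Audio

# From an in-memory waveform: a sampling rate is mandatory.
audio = Audio(waveform=torch.zeros(1, 16000), sampling_rate=16000, metadata={"speaker": "demo"})

# From a file: nothing is decoded until .waveform or .sampling_rate is first accessed.
lazy_audio = Audio(filepath="speech.wav", offset_in_sec=0.5, duration_in_sec=2.0)
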
Audio(**data: Any)

Initialize an Audio instance.

Arguments:
  • waveform (optional): Pre-loaded audio data as a list, NumPy array, or torch.Tensor.
  • sampling_rate (optional): If provided, sets the sampling rate. This must be provided if a waveform is supplied.
  • filepath (optional): File path for lazy loading the waveform if not provided.
  • offset_in_sec (optional): Offset (in seconds) from which to start reading the file. Defaults to 0.0.
  • duration_in_sec (optional): Duration (in seconds) to read from the file. If None, the full file is loaded.
  • backend (optional): I/O backend to use when loading the audio (e.g. "ffmpeg", "sox", "soundfile").
  • metadata (optional): A dictionary of additional metadata.
Raises:
  • ValueError: If neither waveform nor filepath is provided.
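
A short sketch of the validation described above:

from senselab.audio.data_structures.audio import Audio

# A waveform without a sampling rate is rejected at construction time.
try:
    Audio(waveform=[0.0, 0.1, 0.2])
except ValueError as err:
    print(err)  # When a waveform is provided, a sampling_rate must also be supplied.
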
metadata: Dict
model_config = {'arbitrary_types_allowed': True}

Configuration for the model, which should be a dictionary conforming to pydantic's ConfigDict.

waveform: torch.Tensor

Returns the audio waveform as a torch.Tensor.

If the waveform has not been loaded yet, it is loaded lazily from the file.
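
A brief sketch of the channels-first convention (mono input gains a leading channel axis):

from senselab.audio.data_structures.audio import Audio

audio = Audio(waveform=[0.0, 0.25, -0.25, 0.5], sampling_rate=8000)
print(audio.waveform.shape)  # torch.Size([1, 4])
print(audio.waveform.dtype)  # torch.float32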

sampling_rate: int

Returns the sampling rate of the audio.

If the sampling rate is not set and a file is available, it is inferred from the file metadata.
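
A sketch of header-only inspection (the path is hypothetical; requires torchaudio):

from senselab.audio.data_structures.audio import Audio

meta_only = Audio(filepath="speech.wav")
print(meta_only.sampling_rate)  # read via torchaudio.info, without decoding any samples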

@classmethod
def convert_to_tensor(cls, v: Union[List[float], List[List[float]], numpy.ndarray, torch.Tensor]) -> torch.Tensor:

Converts input audio data to a torch.Tensor with shape (num_channels, num_samples).

Arguments:
  • v: Audio data in the form of a list, NumPy array, or torch.Tensor.
Returns:

A torch.Tensor representation of the audio data.
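
A sketch of the conversion rules (1-D input becomes a single channel; dtype is normalized to float32):

import numpy as np
import torch

from senselab.audio.data_structures.audio import Audio

mono = Audio.convert_to_tensor(np.zeros(100, dtype=np.float64))
print(mono.shape, mono.dtype)  # torch.Size([1, 100]) torch.float32

stereo = Audio.convert_to_tensor(torch.randn(2, 100))
print(stereo.shape)  # torch.Size([2, 100]) -- already channels-first, kept as is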

def filepath(self) -> Optional[str]:

Returns the file path of the audio if available.

def generate_id(self) -> str:

Generates a unique identifier for the Audio.

The identifier is computed as an MD5-based UUID derived from the waveform and sampling rate.

Returns:

A string representing the generated unique identifier.
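
A sketch showing that the identifier is content-based (metadata does not enter the hash):

from senselab.audio.data_structures.audio import Audio

a = Audio(waveform=[0.0, 0.5], sampling_rate=16000)
b = Audio(waveform=[0.0, 0.5], sampling_rate=16000, metadata={"take": 2})
print(a.generate_id() == b.generate_id())  # True
print(a == b)  # True: __eq__ compares generated identifiers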

def window_generator(self, window_size: int, step_size: int) -> Generator[Audio, None, None]:

Creates a sliding window generator for the audio waveform.

Each yielded Audio instance corresponds to a window of the waveform.

Arguments:
  • window_size: Number of samples in each window.
  • step_size: Number of samples to advance for each window.
Yields:

Audio: A new Audio instance representing the current window.
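
A sliding-window sketch with 50% overlap; trailing windows may be shorter than window_size:

import torch

from senselab.audio.data_structures.audio import Audio

audio = Audio(waveform=torch.arange(10, dtype=torch.float32), sampling_rate=8000)
for i, win in enumerate(audio.window_generator(window_size=4, step_size=2)):
    print(i, win.waveform.shape[-1])  # window lengths: 4, 4, 4, 4, 2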

def save_to_file(self, file_path: Union[str, os.PathLike], format: Optional[str] = None, encoding: Optional[str] = None, bits_per_sample: Optional[int] = None, buffer_size: int = 4096, backend: Optional[str] = None, compression: Optional[Union[float, int]] = None) -> None:

Saves the Audio object to a file using torchaudio.save.

Arguments:
  • file_path: Destination file path.
  • format: Audio format (e.g. "wav", "ogg", "flac"). Inferred from the file extension if None.
  • encoding: Encoding to use (e.g. "PCM_S", "PCM_U"). Effective for formats like wav and flac.
  • bits_per_sample: Bit depth (e.g. 8, 16, 24, 32, 64).
  • buffer_size: Buffer size in bytes for processing.
  • backend: I/O backend to use (e.g. "ffmpeg", "sox", "soundfile").
  • compression: Compression level for supported formats (e.g. mp3, flac, ogg).
Raises:
  • ModuleNotFoundError: If torchaudio is not available.
  • ValueError: If the waveform dimensions or sampling rate are invalid.
  • RuntimeError: If saving fails.
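
A save sketch (the output path is a placeholder; requires torchaudio with a wav-capable backend):

import torch

from senselab.audio.data_structures.audio import Audio

audio = Audio(waveform=torch.zeros(1, 16000), sampling_rate=16000)
# Format is inferred from the ".wav" extension; 16-bit signed PCM is requested explicitly.
audio.save_to_file("out/demo.wav", encoding="PCM_S", bits_per_sample=16)
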
@classmethod
def from_stream(cls, stream_source: Union[str, os.PathLike, bytes], chunk_duration_in_sec: float = 1.0, metadata: Optional[Dict] = None) -> Generator[Audio, None, None]:

Yield Audio objects from a live audio stream in fixed-duration chunks.

Arguments:
  • stream_source: A file path, stream, or bytes-like object.
  • chunk_duration_in_sec: Duration (in seconds) of each audio chunk.
  • metadata: Additional metadata for each chunk.
Yields:

Audio objects for each chunk read from the stream.
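
A chunked-reading sketch (the source file is a placeholder; requires soundfile):

from senselab.audio.data_structures.audio import Audio

for chunk in Audio.from_stream("long_recording.wav", chunk_duration_in_sec=0.5):
    print(chunk.waveform.shape, chunk.sampling_rate)  # the final chunk may be shorter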

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """This function is meant to behave like a BaseModel method to initialise private attributes.

    It takes context as an argument since that's what pydantic-core passes when calling it.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    if getattr(self, '__pydantic_private__', None) is None:
        pydantic_private = {}
        for name, private_attr in self.__private_attributes__.items():
            default = private_attr.get_default()
            if default is not PydanticUndefined:
                pydantic_private[name] = default
        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Arguments:
  • self: The BaseModel instance.
  • context: The context.
def batch_audios(audios: List[Audio]) -> Tuple[torch.Tensor, Union[int, List[int]], List[Dict]]:

Batches a list of Audio objects into a single Tensor while preserving individual metadata.

Arguments:
  • audios: List of Audio objects. They should all have the same number of channels. It is advised that they also share the same sampling rate when required by processing.
Returns:

A tuple containing:
  • A Tensor of shape (batch_size, num_channels, num_samples),
  • The sampling rate (as an integer if uniform, or a list otherwise),
  • A list of each audio's metadata.

Raises:
  • RuntimeError: If the Audio objects do not share the same number of channels.
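
A batching sketch: shorter clips are zero-padded on the right to the longest length.

import torch

from senselab.audio.data_structures.audio import Audio, batch_audios

a = Audio(waveform=torch.zeros(1, 100), sampling_rate=16000, metadata={"id": "a"})
b = Audio(waveform=torch.zeros(1, 80), sampling_rate=16000, metadata={"id": "b"})

batch, sr, metas = batch_audios([a, b])
print(batch.shape)  # torch.Size([2, 1, 100])
print(sr, metas)  # 16000 [{'id': 'a'}, {'id': 'b'}]
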
def unbatch_audios(batched_audio: torch.Tensor, sampling_rates: Union[int, List[int]], metadatas: List[Dict]) -> List[Audio]:

Unbatches a Tensor of audio data back into a list of Audio objects.

Arguments:
  • batched_audio: Tensor of shape (batch_size, num_channels, num_samples).
  • sampling_rates: A single sampling rate (if uniform) or a list of sampling rates.
  • metadatas: A list of metadata dictionaries for each audio.
Returns:

A list of Audio objects reconstituted from the batched data.

Raises:
  • ValueError: If the batched_audio shape is invalid or if the number of items mismatches.
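
A round-trip sketch; note that padding added by batch_audios is not removed when unbatching:

import torch

from senselab.audio.data_structures.audio import Audio, batch_audios, unbatch_audios

originals = [
    Audio(waveform=torch.zeros(1, 100), sampling_rate=16000, metadata={"id": i})
    for i in range(2)
]
batch, sr, metas = batch_audios(originals)
restored = unbatch_audios(batch, sr, metas)
print(len(restored), restored[0].waveform.shape)  # 2 torch.Size([1, 100])
print(restored[0] == originals[0])  # True: same waveform and sampling rate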