senselab.audio.data_structures.audio
Audio data structure module.
1"""Audio data structure module.""" 2 3try: 4 import torchaudio 5 6 TORCHAUDIO_AVAILABLE = True 7except ModuleNotFoundError: 8 TORCHAUDIO_AVAILABLE = False 9 10try: 11 import soundfile as sf 12 13 SOUNDFILE_AVAILABLE = True 14except ModuleNotFoundError: 15 SOUNDFILE_AVAILABLE = False 16 17import os 18import uuid 19import warnings 20from typing import Any, Dict, Generator, List, Optional, Tuple, Union 21 22import numpy as np 23import torch 24from pydantic import BaseModel, Field, PrivateAttr 25 26from senselab.utils.constants import SENSELAB_NAMESPACE 27 28 29class Audio(BaseModel): 30 """Represents an audio file and its associated metadata. 31 32 Users should instantiate Audio via the constructor (from file path or waveform + sampling rate) 33 or the 'from_stream' method, which yiels Audio objects from a live audio stream. 34 35 Attributes: 36 metadata: A dictionary containing any additional metadata. 37 """ 38 39 # Private attributes used for lazy loading and internal state. 40 _file_path: Union[str, os.PathLike] = PrivateAttr(default="") # Path to audio file (if not pre-loaded) 41 _waveform: Optional[torch.Tensor] = PrivateAttr(default=None) # Audio data (lazy-loaded from file if not provided) 42 _sampling_rate: Optional[int] = PrivateAttr(default=None) # Actual sampling rate; loaded on demand 43 _offset_in_sec: float = PrivateAttr(default=0.0) # Offset in seconds from which to start loading audio 44 _duration_in_sec: Optional[float] = PrivateAttr(default=None) # Duration in seconds to load; None means full file 45 _backend: Optional[str] = PrivateAttr(default=None) # Backend to use when loading the audio 46 47 # Public fields: 48 metadata: Dict = Field(default={}) 49 model_config = {"arbitrary_types_allowed": True} 50 51 def __init__(self, **data: Any) -> None: # noqa: ANN401,D417 52 """Initialize an Audio instance. 53 54 Args: 55 waveform (optional): Pre-loaded audio data as a list, NumPy array, or torch.Tensor. 56 sampling_rate (optional): If provided, sets the sampling rate. 57 This must be provided if a waveform is supplied. 58 filepath (optional): File path for lazy loading the waveform if not provided. 59 offset_in_sec (optional): Offset (in seconds) from which to start reading the file. Defaults to 0.0. 60 duration_in_sec (optional): Duration (in seconds) to read from the file. If None, the full file is loaded. 61 backend (optional): I/O backend to use when loading the audio (e.g. "ffmpeg", "sox", "soundfile"). 62 metadata (optional): A dictionary of additional metadata. 63 64 Raises: 65 ValueError: If neither waveform nor filepath is provided. 66 """ 67 waveform = data.pop("waveform", None) 68 provided_sr = data.pop("sampling_rate", None) 69 filepath = data.pop("filepath", None) 70 offset_in_sec = data.pop("offset_in_sec", 0.0) 71 duration_in_sec = data.pop("duration_in_sec", None) 72 backend = data.pop("backend", None) 73 metadata = data.pop("metadata", {}) 74 75 super().__init__(**data) 76 77 if waveform is not None: 78 # If a waveform and sampling rate are provided, convert and store them; 79 if provided_sr is None: 80 raise ValueError("When a waveform is provided, a sampling_rate must also be supplied.") 81 self._waveform = self.convert_to_tensor(waveform) 82 self._sampling_rate = provided_sr 83 else: 84 # otherwise, a valid filepath is required for lazy loading. 
85 if not filepath: 86 raise ValueError("Either a waveform or a valid filepath must be provided to construct an Audio object.") 87 elif not os.path.exists(filepath): 88 raise FileNotFoundError(f"File {filepath} does not exist.") 89 else: 90 self._file_path = filepath 91 92 # Validate offset 93 if offset_in_sec < 0: 94 raise ValueError("Offset must be a non-negative value") 95 96 # Validate duration (allowing -1 to indicate full file) 97 if duration_in_sec is not None and duration_in_sec < 0 and duration_in_sec != -1: 98 raise ValueError("Duration must be -1 (for full file) or a positive value") 99 100 # Validate backend if provided 101 allowed_backends = {"ffmpeg", "sox", "soundfile"} 102 if backend is not None and backend not in allowed_backends: 103 raise ValueError("Unsupported backend") 104 105 self._offset_in_sec = offset_in_sec 106 self._duration_in_sec = duration_in_sec 107 self._backend = backend 108 109 # Set the metadata 110 self.metadata = metadata 111 112 @property 113 def waveform(self) -> torch.Tensor: 114 """Returns the audio waveform as a torch.Tensor. 115 116 If the waveform has not been loaded yet, it is loaded lazily from the file. 117 """ 118 if self._waveform is None: 119 # print("Lazy loading audio data from file...") 120 self._waveform = self.convert_to_tensor(self._lazy_load_data_from_filepath(self._file_path)) 121 assert self._waveform is not None, "Failed to load audio data." 122 return self._waveform 123 124 @property 125 def sampling_rate(self) -> int: 126 """Returns the sampling rate of the audio. 127 128 If the sampling rate is not set and a file is available, it is inferred from the file metadata. 129 """ 130 if self._sampling_rate is None: 131 if self._file_path and TORCHAUDIO_AVAILABLE: 132 info = torchaudio.info(self._file_path) 133 self._sampling_rate = info.sample_rate 134 else: 135 raise ValueError("Sampling rate is not available.") 136 assert self._sampling_rate is not None, "Sampling rate should be set." 137 return self._sampling_rate 138 139 @classmethod 140 def convert_to_tensor(cls, v: Union[List[float], List[List[float]], np.ndarray, torch.Tensor]) -> torch.Tensor: 141 """Converts input audio data to a torch.Tensor with shape (num_channels, num_samples). 142 143 Args: 144 v: Audio data in the form of a list, NumPy array, or torch.Tensor. 145 146 Returns: 147 A torch.Tensor representation of the audio data. 148 """ 149 if isinstance(v, list): 150 temporary_tensor = torch.tensor(v) 151 elif isinstance(v, np.ndarray): 152 temporary_tensor = torch.tensor(v) 153 elif isinstance(v, torch.Tensor): 154 temporary_tensor = v.clone() 155 else: 156 raise ValueError("Unsupported data type for audio conversion.") 157 158 if temporary_tensor.ndim == 1: 159 temporary_tensor = temporary_tensor.unsqueeze(0) 160 return temporary_tensor.to(torch.float32) 161 162 def _lazy_load_data_from_filepath(self, filepath: Union[str, os.PathLike]) -> torch.Tensor: 163 """Lazy-loads audio data from the given filepath. 164 165 Converts the stored offset and duration (in seconds) to the required frame indices for torchaudio. 166 167 Args: 168 filepath: The path to the audio file. 169 170 Returns: 171 A torch.Tensor containing the loaded audio data. 172 173 Raises: 174 ModuleNotFoundError: If torchaudio is not available. 175 ValueError: If the offset or duration exceeds the file duration. 176 """ 177 if not TORCHAUDIO_AVAILABLE: 178 raise ModuleNotFoundError( 179 "`torchaudio` is not installed. " 180 "Please install senselab audio dependencies using `pip install 'senselab[audio]'`." 
181 ) 182 183 info = torchaudio.info(filepath) 184 self._sampling_rate = info.sample_rate 185 total_frames = info.num_frames 186 187 # Convert offset_in_sec and duration_in_sec to frame indices. 188 frame_offset = int(self._offset_in_sec * self.sampling_rate) 189 if frame_offset > total_frames: 190 raise ValueError( 191 f"Offset ({self._offset_in_sec} s) exceeds the audio file duration " 192 f"({total_frames / self.sampling_rate:.2f} s)." 193 ) 194 if self._duration_in_sec is not None and self._duration_in_sec > 0: 195 num_frames = int(self._duration_in_sec * self.sampling_rate) 196 # Ensure we don't exceed the file length. 197 num_frames = min(num_frames, total_frames - frame_offset) 198 else: 199 num_frames = -1 # Indicates full file reading 200 201 array, _ = torchaudio.load( 202 filepath, 203 frame_offset=frame_offset, 204 num_frames=num_frames, 205 backend=self._backend, 206 ) 207 return array 208 209 def filepath(self) -> Union[str, None]: 210 """Returns the file path of the audio if available.""" 211 if self._file_path: 212 return str(self._file_path) 213 return None 214 215 def generate_id(self) -> str: 216 """Generates a unique identifier for the Audio. 217 218 The identifier is computed as an MD5-based UUID derived from the waveform and sampling rate. 219 220 Returns: 221 A string representing the generated unique identifier. 222 """ 223 # Use the waveform property so that lazy loading is triggered if needed. 224 unique_hash = uuid.uuid3(uuid.uuid3(SENSELAB_NAMESPACE, str(self.waveform)), str(self.sampling_rate)) 225 return str(unique_hash) 226 227 def __eq__(self, other: object) -> bool: 228 """Overrides equality to compare Audio objects based on their generated identifiers. 229 230 Args: 231 other: Another object to compare. 232 233 Returns: 234 True if both Audio instances have the same generated identifier, False otherwise. 235 """ 236 if isinstance(other, Audio): 237 return self.generate_id() == other.generate_id() 238 return False 239 240 def window_generator(self, window_size: int, step_size: int) -> Generator["Audio", None, None]: 241 """Creates a sliding window generator for the audio waveform. 242 243 Each yielded Audio instance corresponds to a window of the waveform. 244 245 Args: 246 window_size: Number of samples in each window. 247 step_size: Number of samples to advance for each window. 248 249 Yields: 250 Audio: A new Audio instance representing the current window. 251 """ 252 if step_size > window_size: 253 warnings.warn("Step size is greater than window size. Some portions of the audio may not be included.") 254 255 num_samples = self.waveform.size(-1) 256 current_position = 0 257 258 while current_position < num_samples: 259 end_position = min(current_position + window_size, num_samples) 260 window_waveform = self.waveform[:, current_position:end_position] 261 262 yield Audio( 263 waveform=window_waveform, 264 sampling_rate=self.sampling_rate, 265 metadata=self.metadata, 266 ) 267 current_position += step_size 268 269 def save_to_file( 270 self, 271 file_path: Union[str, os.PathLike], 272 format: Optional[str] = None, 273 encoding: Optional[str] = None, 274 bits_per_sample: Optional[int] = None, 275 buffer_size: int = 4096, 276 backend: Optional[str] = None, 277 compression: Optional[Union[float, int]] = None, 278 ) -> None: 279 """Saves the Audio object to a file using torchaudio.save. 280 281 Args: 282 file_path: Destination file path. 283 format: Audio format (e.g. "wav", "ogg", "flac"). Inferred from the file extension if None. 
284 encoding: Encoding to use (e.g. "PCM_S", "PCM_U"). Effective for formats like wav and flac. 285 bits_per_sample: Bit depth (e.g. 8, 16, 24, 32, 64). 286 buffer_size: Buffer size in bytes for processing. 287 backend: I/O backend to use (e.g. "ffmpeg", "sox", "soundfile"). 288 compression: Compression level for supported formats (e.g. mp3, flac, ogg). 289 290 Raises: 291 ModuleNotFoundError: If torchaudio is not available. 292 ValueError: If the waveform dimensions or sampling rate are invalid. 293 RuntimeError: If saving fails. 294 """ 295 if not TORCHAUDIO_AVAILABLE: 296 raise ModuleNotFoundError( 297 "`torchaudio` is not installed. " 298 "Please install senselab audio dependencies using `pip install 'senselab[audio]'`." 299 ) 300 301 if self.waveform.ndim != 2: 302 raise ValueError("Waveform must be a 2D tensor with shape (num_channels, num_samples).") 303 if self.sampling_rate <= 0: 304 raise ValueError("Sampling rate must be a positive integer.") 305 306 output_dir = os.path.dirname(file_path) 307 if not os.access(output_dir, os.W_OK): 308 raise RuntimeError(f"Output directory '{output_dir}' is not writable.") 309 310 try: 311 if not os.path.exists(output_dir): 312 os.makedirs(output_dir) 313 torchaudio.save( 314 uri=file_path, 315 src=self.waveform, 316 sample_rate=self.sampling_rate, 317 channels_first=True, 318 format=format, 319 encoding=encoding, 320 bits_per_sample=bits_per_sample, 321 buffer_size=buffer_size, 322 backend=backend, 323 compression=compression, 324 ) 325 except Exception as e: 326 raise RuntimeError(f"Error saving audio to file: {e}") from e 327 328 @classmethod 329 def from_stream( 330 cls, 331 stream_source: Union[str, os.PathLike, bytes], 332 chunk_duration_in_sec: float = 1.0, 333 metadata: Optional[Dict] = None, 334 ) -> Generator["Audio", None, None]: 335 """Yield Audio objects from a live audio stream in fixed-duration chunks. 336 337 Args: 338 stream_source: A file path, stream, or bytes-like object. 339 chunk_duration_in_sec: Duration (in seconds) of each audio chunk. 340 metadata: Additional metadata for each chunk. 341 342 Yields: 343 Audio objects for each chunk read from the stream. 344 """ 345 if not SOUNDFILE_AVAILABLE: 346 raise ModuleNotFoundError( 347 "`soundfile` is not installed. " 348 "Please install senselab audio dependencies using `pip install 'senselab[audio]'`." 349 ) 350 351 if isinstance(stream_source, (os.PathLike, str)) and not os.path.exists(stream_source): 352 raise FileNotFoundError(f"File {stream_source} does not exist.") 353 354 with sf.SoundFile(stream_source, "r") as audio_file: 355 sampling_rate = audio_file.samplerate 356 chunk_frames = int(chunk_duration_in_sec * sampling_rate) 357 358 while True: 359 chunk = audio_file.read(frames=chunk_frames, dtype="float32", always_2d=True) 360 if chunk.shape[0] == 0: 361 break 362 yield cls( 363 waveform=chunk.T, 364 sampling_rate=sampling_rate, 365 metadata=metadata if metadata else {}, 366 ) 367 368 369def batch_audios(audios: List[Audio]) -> Tuple[torch.Tensor, Union[int, List[int]], List[Dict]]: 370 """Batches a list of Audio objects into a single Tensor while preserving individual metadata. 371 372 Args: 373 audios: List of Audio objects. They should all have the same number of channels. 374 It is advised that they also share the same sampling rate when required by processing. 
375 376 Returns: 377 A tuple containing: 378 - A Tensor of shape (batch_size, num_channels, num_samples), 379 - The sampling rate (as an integer if uniform, or a list otherwise), 380 - A list of each audio's metadata. 381 382 Raises: 383 RuntimeError: If the Audio objects do not share the same number of channels. 384 """ 385 sampling_rates = [] 386 num_channels_list = [] 387 lengths = [] 388 batched_audio = [] 389 metadatas = [] 390 391 for audio in audios: 392 sampling_rates.append(audio.sampling_rate) 393 num_channels_list.append(audio.waveform.shape[0]) 394 lengths.append(audio.waveform.shape[1]) 395 metadatas.append(audio.metadata) 396 397 if len(set(num_channels_list)) != 1: 398 raise RuntimeError("All audios must have the same number of channels.") 399 400 if len(set(sampling_rates)) != 1: 401 warnings.warn("Not all sampling rates are the same.", UserWarning) 402 403 max_length = max(lengths) 404 for audio in audios: 405 waveform = audio.waveform 406 padding = max_length - waveform.shape[1] 407 if padding > 0: 408 pad = torch.zeros((waveform.shape[0], padding), dtype=waveform.dtype) 409 waveform = torch.cat([waveform, pad], dim=1) 410 batched_audio.append(waveform) 411 412 return_sampling_rate: Union[int, List[int]] = ( 413 int(sampling_rates[0]) if len(set(sampling_rates)) == 1 else sampling_rates 414 ) 415 416 return torch.stack(batched_audio), return_sampling_rate, metadatas 417 418 419def unbatch_audios( 420 batched_audio: torch.Tensor, sampling_rates: Union[int, List[int]], metadatas: List[Dict] 421) -> List[Audio]: 422 """Unbatches a Tensor of audio data back into a list of Audio objects. 423 424 Args: 425 batched_audio: Tensor of shape (batch_size, num_channels, num_samples). 426 sampling_rates: A single sampling rate (if uniform) or a list of sampling rates. 427 metadatas: A list of metadata dictionaries for each audio. 428 429 Returns: 430 A list of Audio objects reconstituted from the batched data. 431 432 Raises: 433 ValueError: If the batched_audio shape is invalid or if the number of items mismatches. 434 """ 435 if len(batched_audio.shape) != 3: 436 raise ValueError("Expected batched_audio to have shape (batch_size, num_channels, num_samples).") 437 if batched_audio.shape[0] != len(metadatas) or ( 438 isinstance(sampling_rates, list) and batched_audio.shape[0] != len(sampling_rates) 439 ): 440 raise ValueError("Batch size, sampling_rates, and metadatas must all have the same number of elements.") 441 442 audios = [] 443 for i in range(len(metadatas)): 444 sr = sampling_rates[i] if isinstance(sampling_rates, list) else sampling_rates 445 audios.append(Audio(waveform=batched_audio[i], sampling_rate=sr, metadata=metadatas[i])) 446 return audios
class Audio(BaseModel)
Represents an audio file and its associated metadata.
Users should instantiate Audio via the constructor (from a file path, or from a waveform plus sampling rate) or via the 'from_stream' method, which yields Audio objects from a live audio stream.
Attributes:
- metadata: A dictionary containing any additional metadata.
def __init__(self, **data: Any) -> None
Initialize an Audio instance.
Arguments:
- waveform (optional): Pre-loaded audio data as a list, NumPy array, or torch.Tensor.
- sampling_rate (optional): Sampling rate of the provided waveform, in Hz. Required whenever a waveform is supplied.
- filepath (optional): File path for lazy loading the waveform if not provided.
- offset_in_sec (optional): Offset (in seconds) from which to start reading the file. Defaults to 0.0.
- duration_in_sec (optional): Duration (in seconds) to read from the file. If None, the full file is loaded.
- backend (optional): I/O backend to use when loading the audio (e.g. "ffmpeg", "sox", "soundfile").
- metadata (optional): A dictionary of additional metadata.
Raises:
- ValueError: If neither waveform nor filepath is provided.
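A minimal usage sketch (the file path below is a placeholder; with lazy loading, the file is only read when the waveform or sampling rate is first accessed):

```python
import torch

from senselab.audio.data_structures.audio import Audio

# From an in-memory waveform: one second of silence, mono, 16 kHz.
silence = Audio(waveform=torch.zeros(1, 16000), sampling_rate=16000)

# From a file (placeholder path; the file must exist). Reads 2.0 s starting at 0.5 s.
clip = Audio(filepath="speech.wav", offset_in_sec=0.5, duration_in_sec=2.0)
print(clip.sampling_rate)   # inferred from the file header on first access
print(clip.waveform.shape)  # torch.Size([num_channels, num_samples])
```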
waveform: torch.Tensor (property)
Returns the audio waveform as a torch.Tensor.
If the waveform has not been loaded yet, it is loaded lazily from the file.
sampling_rate: int (property)
Returns the sampling rate of the audio.
If the sampling rate is not set and a file is available, it is inferred from the file metadata.
def convert_to_tensor(cls, v: Union[List[float], List[List[float]], np.ndarray, torch.Tensor]) -> torch.Tensor (classmethod)
Converts input audio data to a torch.Tensor with shape (num_channels, num_samples).
Arguments:
- v: Audio data in the form of a list, NumPy array, or torch.Tensor.
Returns:
A torch.Tensor representation of the audio data.
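A short sketch of the shape handling described above: 1-D input gains a channel dimension, and everything is cast to float32.

```python
import numpy as np

from senselab.audio.data_structures.audio import Audio

mono = Audio.convert_to_tensor([0.0, 0.1, -0.1])        # 1-D list -> shape (1, 3)
stereo = Audio.convert_to_tensor(np.zeros((2, 16000)))  # 2-D array kept as (2, 16000)
print(mono.shape, mono.dtype)  # torch.Size([1, 3]) torch.float32
```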
def filepath(self) -> Union[str, None]
Returns the file path of the audio if available.
def generate_id(self) -> str
Generates a unique identifier for the Audio.
The identifier is computed as an MD5-based UUID derived from the waveform and sampling rate.
Returns:
A string representing the generated unique identifier.
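Because the identifier is derived from content, two Audio objects holding identical waveforms and sampling rates compare equal (this is what the class's __eq__ uses). A small sketch:

```python
import torch

from senselab.audio.data_structures.audio import Audio

a = Audio(waveform=torch.zeros(1, 8000), sampling_rate=8000)
b = Audio(waveform=torch.zeros(1, 8000), sampling_rate=8000)
print(a.generate_id() == b.generate_id())  # True: same content, same ID
print(a == b)                              # True: equality compares generated IDs
```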
def window_generator(self, window_size: int, step_size: int) -> Generator["Audio", None, None]
Creates a sliding window generator for the audio waveform.
Each yielded Audio instance corresponds to a window of the waveform.
Arguments:
- window_size: Number of samples in each window.
- step_size: Number of samples to advance for each window.
Yields:
Audio: A new Audio instance representing the current window.
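For example, 1-second windows with 50% overlap at 16 kHz (note that both sizes are in samples, not seconds):

```python
import torch

from senselab.audio.data_structures.audio import Audio

audio = Audio(waveform=torch.randn(1, 48000), sampling_rate=16000)
for window in audio.window_generator(window_size=16000, step_size=8000):
    print(window.waveform.shape)  # torch.Size([1, 16000]); trailing windows may be shorter
```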
def save_to_file(self, file_path: Union[str, os.PathLike], format: Optional[str] = None, encoding: Optional[str] = None, bits_per_sample: Optional[int] = None, buffer_size: int = 4096, backend: Optional[str] = None, compression: Optional[Union[float, int]] = None) -> None
Saves the Audio object to a file using torchaudio.save.
Arguments:
- file_path: Destination file path.
- format: Audio format (e.g. "wav", "ogg", "flac"). Inferred from the file extension if None.
- encoding: Encoding to use (e.g. "PCM_S", "PCM_U"). Effective for formats like wav and flac.
- bits_per_sample: Bit depth (e.g. 8, 16, 24, 32, 64).
- buffer_size: Buffer size in bytes for processing.
- backend: I/O backend to use (e.g. "ffmpeg", "sox", "soundfile").
- compression: Compression level for supported formats (e.g. mp3, flac, ogg).
Raises:
- ModuleNotFoundError: If torchaudio is not available.
- ValueError: If the waveform dimensions or sampling rate are invalid.
- RuntimeError: If saving fails.
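A minimal sketch (file names are placeholders; per the source above, the destination directory must already exist and be writable, and torchaudio must be installed):

```python
import torch

from senselab.audio.data_structures.audio import Audio

audio = Audio(waveform=torch.zeros(1, 16000), sampling_rate=16000)
audio.save_to_file("./silence.wav", bits_per_sample=16)  # format inferred from extension
audio.save_to_file("./silence.flac", compression=5)      # FLAC compression level
```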
def from_stream(cls, stream_source: Union[str, os.PathLike, bytes], chunk_duration_in_sec: float = 1.0, metadata: Optional[Dict] = None) -> Generator["Audio", None, None] (classmethod)
Yield Audio objects from a live audio stream in fixed-duration chunks.
Arguments:
- stream_source: A file path, stream, or bytes-like object.
- chunk_duration_in_sec: Duration (in seconds) of each audio chunk.
- metadata: Additional metadata for each chunk.
Yields:
Audio objects for each chunk read from the stream.
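A sketch that reads a file in half-second chunks (placeholder path; requires soundfile):

```python
from senselab.audio.data_structures.audio import Audio

for chunk in Audio.from_stream("speech.wav", chunk_duration_in_sec=0.5):
    # For mono 16 kHz input this prints torch.Size([1, 8000]) per chunk;
    # the final chunk may be shorter.
    print(chunk.waveform.shape, chunk.sampling_rate)
```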
def batch_audios(audios: List[Audio]) -> Tuple[torch.Tensor, Union[int, List[int]], List[Dict]]
Batches a list of Audio objects into a single Tensor while preserving individual metadata.
Arguments:
- audios: List of Audio objects. They must all have the same number of channels; sharing the same sampling rate is also recommended, since downstream processing often requires it.
Returns:
A tuple containing:
- A Tensor of shape (batch_size, num_channels, num_samples),
- The sampling rate (as an integer if uniform, or a list otherwise),
- A list of each audio's metadata.
Raises:
- RuntimeError: If the Audio objects do not share the same number of channels.
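For instance, batching two mono clips of different lengths (the shorter one is zero-padded on the right):

```python
import torch

from senselab.audio.data_structures.audio import Audio, batch_audios

a = Audio(waveform=torch.randn(1, 16000), sampling_rate=16000)
b = Audio(waveform=torch.randn(1, 12000), sampling_rate=16000)

batch, sr, metas = batch_audios([a, b])
print(batch.shape)  # torch.Size([2, 1, 16000])
print(sr)           # 16000 (a single int because all rates match)
```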
def unbatch_audios(batched_audio: torch.Tensor, sampling_rates: Union[int, List[int]], metadatas: List[Dict]) -> List[Audio]
Unbatches a Tensor of audio data back into a list of Audio objects.
Arguments:
- batched_audio: Tensor of shape (batch_size, num_channels, num_samples).
- sampling_rates: A single sampling rate (if uniform) or a list of sampling rates.
- metadatas: A list of metadata dictionaries for each audio.
Returns:
A list of Audio objects reconstituted from the batched data.
Raises:
- ValueError: If the batched_audio shape is invalid or if the number of items mismatches.
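Continuing the batch_audios sketch above, the batch can be split back into individual Audio objects; note that zero-padding added during batching is retained:

```python
from senselab.audio.data_structures.audio import unbatch_audios

audios = unbatch_audios(batch, sr, metas)
print(len(audios))               # 2
print(audios[1].waveform.shape)  # torch.Size([1, 16000]): padding is not stripped
```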