senselab.audio.tasks.features_extraction.api

High-level feature extraction for audio.

This module aggregates multiple feature backends—OpenSMILE, Praat/Parselmouth, torchaudio, and torchaudio-squim—into a single convenience function. Each backend can be toggled on/off or configured independently.

Backends:
  • OpenSMILE: Robust, hand-crafted descriptors (e.g., eGeMAPS).
  • Praat/Parselmouth: Prosody and voice-quality measures (pitch, jitter, shimmer, formants, etc.).
  • torchaudio: Spectral features (spectrogram, mel, MFCC, pitch).
  • torchaudio-squim: Objective quality metrics (e.g., STOI, PESQ, SI-SDR).
  1"""High-level feature extraction for audio.
  2
  3This module aggregates multiple feature backends—OpenSMILE, Praat/Parselmouth,
  4torchaudio, and torchaudio-squim—into a single convenience function. Each
  5backend can be toggled on/off or configured independently.
  6
  7Backends:
  8    - **OpenSMILE**: Robust, hand-crafted descriptors (e.g., eGeMAPS).
  9    - **Praat/Parselmouth**: Prosody and voice-quality measures
 10        (pitch, jitter, shimmer, formants, etc.).
 11    - **torchaudio**: Spectral features (spectrogram, mel, MFCC, pitch).
 12    - **torchaudio-squim**: Objective quality metrics (e.g., STOI, PESQ, SI-SDR).
 13"""
 14
 15from typing import Any, Dict, List, Union
 16
 17from senselab.audio.data_structures import Audio
 18
 19from .opensmile import extract_opensmile_features_from_audios
 20from .praat_parselmouth import extract_praat_parselmouth_features_from_audios
 21from .torchaudio import extract_torchaudio_features_from_audios
 22from .torchaudio_squim import extract_objective_quality_features_from_audios
 23
 24
 25def extract_features_from_audios(
 26    audios: List[Audio],
 27    opensmile: Union[Dict[str, str], bool] = True,
 28    parselmouth: Union[Dict[str, str], bool] = True,
 29    torchaudio: Union[Dict[str, str], bool] = True,
 30    torchaudio_squim: bool = True,
 31) -> List[Dict[str, Any]]:
 32    """Extract multi-backend features for each `Audio` and return a list of dicts.
 33
 34    Enabled backends run in parallelizable sub-workflows (where applicable) and
 35    their outputs are merged per audio. Disable any backend by passing ``False``;
 36    customize a backend by passing a dict (see below for keys and defaults).
 37
 38    Args:
 39        audios (list[Audio]):
 40            Input audio objects.
 41        opensmile (dict | bool, optional):
 42            - ``False`` → skip OpenSMILE.
 43            - ``True``  → use defaults:
 44                ``{"feature_set": "eGeMAPSv02", "feature_level": "Functionals"}``
 45            - ``dict`` → override any of the above keys. `feature_set` and `feature_level`
 46              should match OpenSMILE presets.
 47        parselmouth (dict | bool, optional):
 48            - ``False`` → skip Praat/Parselmouth.
 49            - ``True``  → use defaults (pitch, intensity, jitter, shimmer, formants, etc. enabled):
 50                ``{"time_step": 0.005, "window_length": 0.025, "pitch_unit": "Hertz",
 51                  "speech_rate": True, "intensity_descriptors": True,
 52                  "harmonicity_descriptors": True, "formants": True, "spectral_moments": True,
 53                  "pitch": True, "slope_tilt": True, "cpp_descriptors": True, "duration": True,
 54                  "jitter": True, "shimmer": True}``
 55            - ``dict`` → override any of the above keys.
 56        torchaudio (dict | bool, optional):
 57            - ``False`` → skip torchaudio.
 58            - ``True``  → use defaults:
 59                ``{"freq_low": 80, "freq_high": 500, "n_fft": 1024, "n_mels": 128,
 60                  "n_mfcc": 40, "win_length": None, "hop_length": None}``
 61            - ``dict`` → override any of the above keys (e.g., ``n_fft``, ``hop_length``).
 62        torchaudio_squim (bool, optional):
 63            - ``False`` → skip objective quality metrics.
 64            - ``True``  → compute metrics such as STOI, PESQ, SI-SDR (backend-dependent defaults).
 65
 66    Returns:
 67        list[dict[str, Any]]: One dict per input audio. Keys present depend on
 68        enabled backends; typical structure:
 69
 70        - ``"opensmile"`` → ``dict[str, float]`` of aggregated descriptors.
 71        - ``"praat_parselmouth"`` → ``dict[str, float]`` (prosody/voice-quality).
 72        - ``"torchaudio"`` → nested ``dict[str, Tensor]`` (e.g., ``spectrogram``,
 73          ``mel_spectrogram``, ``mfcc``, ``pitch``). Tensors have shapes defined
 74          by your STFT/mel/MFCC settings.
 75        - ``"torchaudio_squim"`` → ``dict[str, float]`` with objective quality scores.
 76
 77    Raises:
 78        ModuleNotFoundError:
 79            If a requested backend library is not installed (e.g., `opensmile`,
 80            `praat-parselmouth`, or dependencies required by torchaudio-squim).
 81        ValueError:
 82            If invalid parameter combinations are provided to a backend.
 83
 84    Tips:
 85        - **Memory**: Torchaudio tensors (spectrograms, mels) can be large. Convert or
 86          downsample if you only need summary stats.
 87
 88    Example (all defaults):
 89        >>> from pathlib import Path
 90        >>> from senselab.audio.data_structures import Audio
 91        >>> a1 = Audio(filepath=Path("sample1.wav").resolve())
 92        >>> feats = extract_features_from_audios([a1])
 93        >>> sorted(feats[0].keys())
 94        ['opensmile', 'praat_parselmouth', 'torchaudio', 'torchaudio_squim']
 95
 96    Example (all defaults II):
 97        >>> from senselab.audio.data_structures import Audio
 98        >>> from senselab.audio.tasks.features_extraction import extract_features_from_audios
 99        >>> from pathlib import Path
100        >>> a1 = Audio(filepath=Path("sample1.wav").resolve())
101        >>> extract_features_from_audios([a1])
102        [{'opensmile': {'F0semitoneFrom27.5Hz_sma3nz_amean': 25.710796356201172,
103        'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': 0.1605353206396103,
104        'F0semitoneFrom27.5Hz_sma3nz_percentile20.0': 21.095951080322266,
105        'F0semitoneFrom27.5Hz_sma3nz_percentile50.0': 25.9762020111084,
106        'F0semitoneFrom27.5Hz_sma3nz_percentile80.0': 29.512413024902344,
107        'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2': 8.416461944580078,
108        'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope': 82.34796905517578,
109        'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope': 99.20043182373047,
110        'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope': 22.002275466918945,
111        'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope': 9.043970108032227,
112        'loudness_sma3_amean': 0.86087566614151,
113        'loudness_sma3_stddevNorm': 0.43875235319137573,
114        'loudness_sma3_percentile20.0': 0.5877408981323242,
115        'loudness_sma3_percentile50.0': 0.8352401852607727,
116        'loudness_sma3_percentile80.0': 1.1747918128967285,
117        'loudness_sma3_pctlrange0-2': 0.5870509147644043,
118        'loudness_sma3_meanRisingSlope': 10.285204887390137,
119        'loudness_sma3_stddevRisingSlope': 7.544795513153076,
120        'loudness_sma3_meanFallingSlope': 7.612527370452881,
121        'loudness_sma3_stddevFallingSlope': 4.15903902053833,
122        'spectralFlux_sma3_amean': 0.3213598430156708,
123        'spectralFlux_sma3_stddevNorm': 0.6921582818031311,
124        'mfcc1_sma3_amean': 10.274803161621094,
125        'mfcc1_sma3_stddevNorm': 1.1581648588180542,
126        'mfcc2_sma3_amean': 4.262020111083984,
127        'mfcc2_sma3_stddevNorm': 2.052302837371826,
128        'mfcc3_sma3_amean': 7.624598026275635,
129        'mfcc3_sma3_stddevNorm': 1.4570358991622925,
130        'mfcc4_sma3_amean': 3.6676177978515625,
131        'mfcc4_sma3_stddevNorm': 2.6902272701263428,
132        'jitterLocal_sma3nz_amean': 0.019597552716732025,
133        'jitterLocal_sma3nz_stddevNorm': 0.9063860177993774,
134        'shimmerLocaldB_sma3nz_amean': 1.264746069908142,
135        'shimmerLocaldB_sma3nz_stddevNorm': 0.4629262685775757,
136        'HNRdBACF_sma3nz_amean': 3.6400067806243896,
137        'HNRdBACF_sma3nz_stddevNorm': 0.5911334753036499,
138        'logRelF0-H1-H2_sma3nz_amean': 1.215877652168274,
139        'logRelF0-H1-H2_sma3nz_stddevNorm': 3.883843183517456,
140        'logRelF0-H1-A3_sma3nz_amean': 18.830764770507812,
141        'logRelF0-H1-A3_sma3nz_stddevNorm': 0.30870768427848816,
142        'F1frequency_sma3nz_amean': 665.1713256835938,
143        'F1frequency_sma3nz_stddevNorm': 0.41958823800086975,
144        'F1bandwidth_sma3nz_amean': 1300.2757568359375,
145        'F1bandwidth_sma3nz_stddevNorm': 0.16334553062915802,
146        'F1amplitudeLogRelF0_sma3nz_amean': -132.1533660888672,
147        'F1amplitudeLogRelF0_sma3nz_stddevNorm': -0.6691396832466125,
148        'F2frequency_sma3nz_amean': 1657.013916015625,
149        'F2frequency_sma3nz_stddevNorm': 0.17019854485988617,
150        'F2bandwidth_sma3nz_amean': 1105.7457275390625,
151        'F2bandwidth_sma3nz_stddevNorm': 0.24520403146743774,
152        'F2amplitudeLogRelF0_sma3nz_amean': -132.76707458496094,
153        'F2amplitudeLogRelF0_sma3nz_stddevNorm': -0.6468541026115417,
154        'F3frequency_sma3nz_amean': 2601.6630859375,
155        'F3frequency_sma3nz_stddevNorm': 0.11457356810569763,
156        'F3bandwidth_sma3nz_amean': 1091.15087890625,
157        'F3bandwidth_sma3nz_stddevNorm': 0.3787318468093872,
158        'F3amplitudeLogRelF0_sma3nz_amean': -134.52105712890625,
159        'F3amplitudeLogRelF0_sma3nz_stddevNorm': -0.620308518409729,
160        'alphaRatioV_sma3nz_amean': -8.626543045043945,
161        'alphaRatioV_sma3nz_stddevNorm': -0.4953792095184326,
162        'hammarbergIndexV_sma3nz_amean': 16.796842575073242,
163        'hammarbergIndexV_sma3nz_stddevNorm': 0.3567312955856323,
164        'slopeV0-500_sma3nz_amean': 0.021949246525764465,
165        'slopeV0-500_sma3nz_stddevNorm': 1.0097224712371826,
166        'slopeV500-1500_sma3nz_amean': -0.008139753714203835,
167        'slopeV500-1500_sma3nz_stddevNorm': -1.6243411302566528,
168        'spectralFluxV_sma3nz_amean': 0.4831695556640625,
169        'spectralFluxV_sma3nz_stddevNorm': 0.48576226830482483,
170        'mfcc1V_sma3nz_amean': 20.25444793701172,
171        'mfcc1V_sma3nz_stddevNorm': 0.44413772225379944,
172        'mfcc2V_sma3nz_amean': 3.619405746459961,
173        'mfcc2V_sma3nz_stddevNorm': 2.1765975952148438,
174        'mfcc3V_sma3nz_amean': 7.736487865447998,
175        'mfcc3V_sma3nz_stddevNorm': 1.8630998134613037,
176        'mfcc4V_sma3nz_amean': 4.60503625869751,
177        'mfcc4V_sma3nz_stddevNorm': 2.864668846130371,
178        'alphaRatioUV_sma3nz_amean': -2.5990121364593506,
179        'hammarbergIndexUV_sma3nz_amean': 8.862899780273438,
180        'slopeUV0-500_sma3nz_amean': 0.002166695659980178,
181        'slopeUV500-1500_sma3nz_amean': 0.006735736038535833,
182        'spectralFluxUV_sma3nz_amean': 0.24703539907932281,
183        'loudnessPeaksPerSec': 3.8834950923919678,
184        'VoicedSegmentsPerSec': 2.745098114013672,
185        'MeanVoicedSegmentLengthSec': 0.12214285880327225,
186        'StddevVoicedSegmentLengthSec': 0.09025190770626068,
187        'MeanUnvoicedSegmentLength': 0.20666664838790894,
188        'StddevUnvoicedSegmentLength': 0.17666037380695343,
189        'equivalentSoundLevel_dBp': -24.297256469726562},
190        'torchaudio': {'pitch': tensor([484.8485, 484.8485, 470.5882, 372.0930, 340.4255, 320.0000, 296.2963,
191                140.3509, 135.5932, 126.9841, 124.0310, 124.0310, 113.4752, 110.3448,
192                110.3448, 108.8435, 105.9603, 108.8435, 110.3448, 113.4752, 113.4752,
193                124.0310, 113.4752, 113.4752, 108.8435, 105.9603, 105.9603, 105.9603,
194                106.6667, 105.9603, 105.9603, 104.5752, 104.5752, 104.5752, 104.5752,
195                101.2658, 101.2658, 100.6289, 100.6289, 100.0000, 100.0000,  98.1595,
196                    98.1595,  98.1595,  95.8084,  95.8084,  95.8084,  95.2381,  95.2381,
197                    94.6746,  91.9540,  91.9540,  91.9540,  91.9540,  91.9540,  91.4286,
198                    91.4286,  91.4286,  91.4286,  91.4286,  91.4286,  91.4286,  90.9091,
199                    90.9091,  90.9091,  91.4286,  91.4286,  91.4286,  91.4286,  91.4286,
200                    91.4286,  91.4286,  91.4286,  91.4286,  91.4286,  91.4286,  91.4286,
201                    91.4286,  91.9540,  91.9540,  93.0233,  93.5673,  93.5673,  94.1176,
202                    94.6746,  94.6746,  94.6746,  95.8084,  96.3855,  96.9697, 100.0000,
203                100.0000, 100.0000, 100.0000, 100.0000, 100.0000, 103.8961, 104.5752,
204                104.5752, 106.6667, 106.6667, 106.6667, 111.1111, 116.7883, 116.7883,
205                116.7883, 118.5185, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121,
206                121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 122.1374,
207                123.0769, 123.0769, 125.9843, 125.9843, 125.9843, 123.0769, 123.0769,
208                122.1374, 122.1374, 121.2121, 121.2121, 121.2121, 120.3008, 117.6471,
209                117.6471, 117.6471, 108.8435, 108.1081, 106.6667, 105.2632, 105.2632,
210                105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632,
211                108.8435, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632,
212                105.2632, 119.4030, 119.4030, 120.3008, 120.3008, 121.2121, 121.2121,
213                121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 122.1374,
214                122.1374, 122.1374, 122.1374, 123.0769, 123.0769, 123.0769, 123.0769,
215                123.0769, 123.0769, 120.3008, 120.3008, 120.3008, 119.4030, 113.4752,
216                106.6667, 103.2258, 103.2258,  96.9697,  96.9697,  96.9697,  96.9697,
217                    96.9697,  96.9697,  96.9697,  96.9697,  96.9697,  96.9697,  96.9697,
218                    96.9697,  96.9697,  96.9697,  96.9697,  96.9697,  96.9697,  96.9697,
219                    96.9697,  96.9697,  96.9697,  96.9697,  96.9697,  97.5610,  97.5610,
220                    97.5610,  97.5610,  97.5610,  98.1595, 100.0000, 100.6289, 100.6289,
221                100.6289, 100.6289, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658,
222                101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658,  97.5610,
223                    90.9091,  89.8876,  88.8889,  88.8889,  88.3978,  87.4317,  86.0215,
224                    86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,
225                    86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,
226                    86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,
227                    86.0215,  86.0215,  86.0215,  86.0215,  86.4865,  86.4865,  86.4865,
228                    86.4865,  86.4865,  87.4317,  87.9121,  87.9121,  87.9121,  89.8876,
229                    90.9091,  90.9091,  90.9091,  90.9091,  90.9091,  91.4286,  91.4286,
230                    91.4286,  92.4855,  92.4855,  93.0233,  93.0233,  93.0233,  93.5673,
231                    93.5673,  95.2381,  95.2381, 100.0000, 101.9108, 112.6761, 112.6761,
232                112.6761, 122.1374, 122.1374, 122.1374, 130.0813, 126.9841, 126.9841,
233                130.0813, 130.0813, 130.0813, 130.0813, 137.9310, 130.0813, 130.0813,
234                130.0813, 126.9841, 125.9843, 126.9841, 125.9843, 125.9843, 125.9843,
235                125.9843, 125.9843, 126.9841, 126.9841, 130.0813, 130.0813, 126.9841,
236                130.0813, 130.0813, 132.2314, 130.0813, 130.0813, 132.2314, 134.4538,
237                134.4538, 135.5932, 135.5932, 137.9310, 135.5932, 135.5932, 135.5932,
238                135.5932, 137.9310, 137.9310, 140.3509, 141.5929, 141.5929, 141.5929,
239                144.1441, 144.1441, 149.5327, 149.5327, 149.5327, 141.5929, 141.5929,
240                141.5929, 149.5327, 149.5327, 153.8462, 160.0000, 160.0000, 160.0000,
241                160.0000, 160.0000, 163.2653, 164.9485, 164.9485, 164.9485, 164.9485,
242                164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485,
243                164.9485, 164.9485, 164.9485, 164.9485, 156.8627, 155.3398, 155.3398,
244                155.3398, 153.8462, 153.8462, 152.3810, 152.3810, 149.5327, 148.1481,
245                148.1481, 148.1481, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890,
246                148.1481, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890,
247                146.7890, 146.7890, 145.4545, 145.4545, 152.3810, 153.8462, 153.8462,
248                153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462,
249                153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462,
250                153.8462, 153.8462, 153.8462, 153.8462, 152.3810, 152.3810, 152.3810,
251                152.3810, 149.5327, 148.1481, 148.1481, 148.1481, 148.1481, 148.1481,
252                148.1481, 146.7890, 148.1481, 148.1481, 145.4545, 145.4545, 145.4545,
253                145.4545, 145.4545, 144.1441, 144.1441, 144.1441, 142.8571, 142.8571,
254                142.8571, 142.8571, 142.8571, 142.8571, 142.8571, 144.1441, 144.1441,
255                145.4545, 145.4545, 145.4545, 145.4545, 146.7890, 146.7890, 146.7890,
256                146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890,
257                146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 145.4545,
258                145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545,
259                146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890,
260                400.0000, 400.0000, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485,
261                484.8485, 484.8485, 484.8485, 484.8485, 484.8485]),
262        'mel_filter_bank': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02,  ..., 0.0000e+00, 0.0000e+00,
263                    0.0000e+00],
264                [3.0977e-04, 1.5698e-02, 1.5785e-02,  ..., 0.0000e+00, 0.0000e+00,
265                    0.0000e+00],
266                [8.2318e-05, 1.4367e-02, 2.8095e-01,  ..., 0.0000e+00, 0.0000e+00,
267                    0.0000e+00],
268                ...,
269                [3.6322e-05, 9.7330e-03, 5.4812e-02,  ..., 0.0000e+00, 0.0000e+00,
270                    0.0000e+00],
271                [2.2802e-05, 1.2481e-02, 5.8374e-02,  ..., 0.0000e+00, 0.0000e+00,
272                    0.0000e+00],
273                [5.3029e-05, 3.1305e-02, 7.9842e-02,  ..., 0.0000e+00, 0.0000e+00,
274                    0.0000e+00]]),
275        'mfcc': tensor([[-6.2570e+02, -4.7505e+02, -3.1078e+02,  ..., -6.3893e+02,
276                    -6.3893e+02, -6.3893e+02],
277                [ 1.3593e+01,  1.9928e+01,  2.6022e+01,  ...,  3.9824e-05,
278                    3.9824e-05,  3.9824e-05],
279                [ 7.3933e+00, -2.1680e+01, -1.4259e+01,  ..., -1.3440e-05,
280                    -1.3440e-05, -1.3440e-05],
281                ...,
282                [ 1.8122e+00, -3.1072e+00, -3.7336e+00,  ...,  7.0669e-05,
283                    7.0669e-05,  7.0669e-05],
284                [-2.7518e-01, -9.4738e+00, -2.3157e+00,  ..., -1.7963e-04,
285                    -1.7963e-04, -1.7963e-04],
286                [ 2.3144e-01, -6.4129e+00, -8.4420e+00,  ..., -1.5891e-04,
287                    -1.5891e-04, -1.5891e-04]]),
288        'mel_spectrogram': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02,  ..., 0.0000e+00, 0.0000e+00,
289                    0.0000e+00],
290                [3.0977e-04, 1.5698e-02, 1.5785e-02,  ..., 0.0000e+00, 0.0000e+00,
291                    0.0000e+00],
292                [8.2318e-05, 1.4367e-02, 2.8095e-01,  ..., 0.0000e+00, 0.0000e+00,
293                    0.0000e+00],
294                ...,
295                [3.6322e-05, 9.7330e-03, 5.4812e-02,  ..., 0.0000e+00, 0.0000e+00,
296                    0.0000e+00],
297                [2.2802e-05, 1.2481e-02, 5.8374e-02,  ..., 0.0000e+00, 0.0000e+00,
298                    0.0000e+00],
299                [5.3029e-05, 3.1305e-02, 7.9842e-02,  ..., 0.0000e+00, 0.0000e+00,
300                    0.0000e+00]]),
301        'spectrogram': tensor([[3.5553e-06, 5.9962e-03, 2.7176e-02,  ..., 0.0000e+00, 0.0000e+00,
302                    0.0000e+00],
303                [5.0707e-04, 1.1670e-02, 1.5016e-02,  ..., 0.0000e+00, 0.0000e+00,
304                    0.0000e+00],
305                [3.1901e-04, 1.8529e-02, 1.8078e-02,  ..., 0.0000e+00, 0.0000e+00,
306                    0.0000e+00],
307                ...,
308                [1.0302e-05, 3.5917e-03, 2.7169e-03,  ..., 0.0000e+00, 0.0000e+00,
309                    0.0000e+00],
310                [9.6637e-08, 1.3364e-03, 1.8495e-02,  ..., 0.0000e+00, 0.0000e+00,
311                    0.0000e+00],
312                [1.4414e-05, 1.0598e-04, 2.8004e-02,  ..., 0.0000e+00, 0.0000e+00,
313                    0.0000e+00]])},
314        'parselmouth': ({'duration': 5.1613125,
315            'speaking_rate': 3.874983349680919,
316            'articulation_rate': 3.874983349680919,
317            'phonation_ratio': 1.0,
318            'pause_rate': 0.0,
319            'mean_pause_duration': 0.0,
320            'mean_f0_hertz': 118.59917806814313,
321            'std_f0_hertz': 30.232960797931817,
322            'mean_intensity_db': 69.76277128148347,
323            'std_intensity_db': 58.54414165935646,
324            'range_ratio_intensity_db': -0.25736445047981316,
325            'pitch_floor': 60.0,
326            'pitch_ceiling': 250.0,
327            'mean_hnr_db': 3.3285614070654375,
328            'std_hnr_db': 3.36490968797237,
329            'spectral_slope': -13.982306776816046,
330            'spectral_tilt': -0.004414961849917737,
331            'cepstral_peak_prominence_mean': 7.0388038514346825,
332            'cepstral_peak_prominence_std': 1.5672438573255245,
333            'mean_f1_loc': 613.4664268420964,
334            'std_f1_loc': 303.98235579059883,
335            'mean_b1_loc': 401.96960219300837,
336            'std_b1_loc': 400.9001719378358,
337            'mean_f2_loc': 1701.7755281579418,
338            'std_f2_loc': 325.4405394017738,
339            'mean_b2_loc': 434.542188503193,
340            'std_b2_loc': 380.8914612651878,
341            'spectral_gravity': 579.587511962247,
342            'spectral_std_dev': 651.3025011919739,
343            'spectral_skewness': 3.5879707548251045,
344            'spectral_kurtosis': 19.991495997865282,
345            'local_jitter': 0.02553484151620524,
346            'localabsolute_jitter': 0.00021392842618599855,
347            'rap_jitter': 0.012174051087556429,
348            'ppq5_jitter': 0.01597797849248675,
349            'ddp_jitter': 0.03652215326266929,
350            'local_shimmer': 0.1530474665829716,
351            'localDB_shimmer': 1.3511061323188314,
352            'apq3_shimmer': 0.0702984931637734,
353            'apq5_shimmer': 0.09680154282272849,
354            'apq11_shimmer': 0.19065409516266155,
355            'dda_shimmer': 0.2108954794913202},),
356        'torchaudio_squim': {'stoi': 0.9247563481330872,
357        'pesq': 1.3702949285507202,
358        'si_sdr': 11.71167278289795}}]
359
360    Example (disable OpenSMILE; customize torchaudio):
361        >>> from pathlib import Path
362        >>> from senselab.audio.data_structures import Audio
363        >>> a1 = Audio(filepath=Path("sample1.wav").resolve())
364        >>> feats = extract_features_from_audios(
365        ...     [a1],
366        ...     opensmile=False,
367        ...     torchaudio={
368        ...         "n_fft": 2048,
369        ...         "hop_length": 256
370        ...     },
371        ... )
372        >>> "opensmile" in feats[0]
373        False
374
375    Example (Parselmouth only, custom pitch range):
376        >>> from pathlib import Path
377        >>> from senselab.audio.data_structures import Audio
378        >>> a1 = Audio(filepath=Path("sample1.wav").resolve())
379        >>> feats = extract_features_from_audios(
380        ...     [a1],
381        ...     opensmile=False,
382        ...     torchaudio=False,
383        ...     torchaudio_squim=False,
384        ...     parselmouth={"pitch_unit": "Hertz"},
385        ... )
386        >>> "praat_parselmouth" in feats[0]
387        True
388    """
389    if opensmile:
390        default_opensmile: Dict[str, Any] = {"feature_set": "eGeMAPSv02", "feature_level": "Functionals"}
391        if isinstance(opensmile, dict):
392            my_opensmile = {**default_opensmile, **opensmile}
393        else:
394            my_opensmile = default_opensmile
395        opensmile_features = extract_opensmile_features_from_audios(audios, **my_opensmile)  # type: ignore
396    if parselmouth:
397        default_parselmouth: Dict[str, Any] = {
398            "time_step": 0.005,
399            "window_length": 0.025,
400            "pitch_unit": "Hertz",
401            "speech_rate": True,
402            "intensity_descriptors": True,
403            "harmonicity_descriptors": True,
404            "formants": True,
405            "spectral_moments": True,
406            "pitch": True,
407            "slope_tilt": True,
408            "cpp_descriptors": True,
409            "duration": True,
410            "jitter": True,
411            "shimmer": True,
412        }
413        # Update default_parselmouth with provided parselmouth dictionary
414        if isinstance(parselmouth, dict):
415            my_parselmouth = {**default_parselmouth, **parselmouth}
416        else:
417            my_parselmouth = default_parselmouth
418
419        parselmouth_features = extract_praat_parselmouth_features_from_audios(audios=audios, **my_parselmouth)  # type: ignore
420
421    if torchaudio:
422        default_torchaudio: Dict[str, Any] = {
423            "freq_low": 80,
424            "freq_high": 500,
425            "n_fft": 1024,
426            "n_mels": 128,
427            "n_mfcc": 40,
428            "win_length": None,
429            "hop_length": None,
430        }
431        if isinstance(torchaudio, dict):
432            my_torchaudio = {**default_torchaudio, **torchaudio}
433        else:
434            my_torchaudio = default_torchaudio
435
436        torchaudio_features = extract_torchaudio_features_from_audios(audios=audios, **my_torchaudio)  # type: ignore
437    if torchaudio_squim:
438        torchaudio_squim_features = extract_objective_quality_features_from_audios(audios=audios)
439
440    results = []
441    for i in range(len(audios)):
442        result = {}
443        if opensmile:
444            result["opensmile"] = opensmile_features[i]
445        if parselmouth:
446            result["praat_parselmouth"] = parselmouth_features[i]
447        if torchaudio:
448            result["torchaudio"] = torchaudio_features[i]
449        if torchaudio_squim:
450            result["torchaudio_squim"] = torchaudio_squim_features[i]
451        results.append(result)
452
453    return results
def extract_features_from_audios( audios: List[senselab.audio.data_structures.audio.Audio], opensmile: Union[Dict[str, str], bool] = True, parselmouth: Union[Dict[str, str], bool] = True, torchaudio: Union[Dict[str, str], bool] = True, torchaudio_squim: bool = True) -> List[Dict[str, Any]]:
 26def extract_features_from_audios(
 27    audios: List[Audio],
 28    opensmile: Union[Dict[str, str], bool] = True,
 29    parselmouth: Union[Dict[str, str], bool] = True,
 30    torchaudio: Union[Dict[str, str], bool] = True,
 31    torchaudio_squim: bool = True,
 32) -> List[Dict[str, Any]]:
 33    """Extract multi-backend features for each `Audio` and return a list of dicts.
 34
 35    Enabled backends run in parallelizable sub-workflows (where applicable) and
 36    their outputs are merged per audio. Disable any backend by passing ``False``;
 37    customize a backend by passing a dict (see below for keys and defaults).
 38
 39    Args:
 40        audios (list[Audio]):
 41            Input audio objects.
 42        opensmile (dict | bool, optional):
 43            - ``False`` → skip OpenSMILE.
 44            - ``True``  → use defaults:
 45                ``{"feature_set": "eGeMAPSv02", "feature_level": "Functionals"}``
 46            - ``dict`` → override any of the above keys. `feature_set` and `feature_level`
 47              should match OpenSMILE presets.
 48        parselmouth (dict | bool, optional):
 49            - ``False`` → skip Praat/Parselmouth.
 50            - ``True``  → use defaults (pitch, intensity, jitter, shimmer, formants, etc. enabled):
 51                ``{"time_step": 0.005, "window_length": 0.025, "pitch_unit": "Hertz",
 52                  "speech_rate": True, "intensity_descriptors": True,
 53                  "harmonicity_descriptors": True, "formants": True, "spectral_moments": True,
 54                  "pitch": True, "slope_tilt": True, "cpp_descriptors": True, "duration": True,
 55                  "jitter": True, "shimmer": True}``
 56            - ``dict`` → override any of the above keys.
 57        torchaudio (dict | bool, optional):
 58            - ``False`` → skip torchaudio.
 59            - ``True``  → use defaults:
 60                ``{"freq_low": 80, "freq_high": 500, "n_fft": 1024, "n_mels": 128,
 61                  "n_mfcc": 40, "win_length": None, "hop_length": None}``
 62            - ``dict`` → override any of the above keys (e.g., ``n_fft``, ``hop_length``).
 63        torchaudio_squim (bool, optional):
 64            - ``False`` → skip objective quality metrics.
 65            - ``True``  → compute metrics such as STOI, PESQ, SI-SDR (backend-dependent defaults).
 66
 67    Returns:
 68        list[dict[str, Any]]: One dict per input audio. Keys present depend on
 69        enabled backends; typical structure:
 70
 71        - ``"opensmile"`` → ``dict[str, float]`` of aggregated descriptors.
 72        - ``"praat_parselmouth"`` → ``dict[str, float]`` (prosody/voice-quality).
 73        - ``"torchaudio"`` → nested ``dict[str, Tensor]`` (e.g., ``spectrogram``,
 74          ``mel_spectrogram``, ``mfcc``, ``pitch``). Tensors have shapes defined
 75          by your STFT/mel/MFCC settings.
 76        - ``"torchaudio_squim"`` → ``dict[str, float]`` with objective quality scores.
 77
 78    Raises:
 79        ModuleNotFoundError:
 80            If a requested backend library is not installed (e.g., `opensmile`,
 81            `praat-parselmouth`, or dependencies required by torchaudio-squim).
 82        ValueError:
 83            If invalid parameter combinations are provided to a backend.
 84
 85    Tips:
 86        - **Memory**: Torchaudio tensors (spectrograms, mels) can be large. Convert or
 87          downsample if you only need summary stats.
 88
 89    Example (all defaults):
 90        >>> from pathlib import Path
 91        >>> from senselab.audio.data_structures import Audio
 92        >>> a1 = Audio(filepath=Path("sample1.wav").resolve())
 93        >>> feats = extract_features_from_audios([a1])
 94        >>> sorted(feats[0].keys())
 95        ['opensmile', 'praat_parselmouth', 'torchaudio', 'torchaudio_squim']
 96
 97    Example (all defaults II):
 98        >>> from senselab.audio.data_structures import Audio
 99        >>> from senselab.audio.tasks.features_extraction import extract_features_from_audios
100        >>> from pathlib import Path
101        >>> a1 = Audio(filepath=Path("sample1.wav").resolve())
102        >>> extract_features_from_audios([a1])
103        [{'opensmile': {'F0semitoneFrom27.5Hz_sma3nz_amean': 25.710796356201172,
104        'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': 0.1605353206396103,
105        'F0semitoneFrom27.5Hz_sma3nz_percentile20.0': 21.095951080322266,
106        'F0semitoneFrom27.5Hz_sma3nz_percentile50.0': 25.9762020111084,
107        'F0semitoneFrom27.5Hz_sma3nz_percentile80.0': 29.512413024902344,
108        'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2': 8.416461944580078,
109        'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope': 82.34796905517578,
110        'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope': 99.20043182373047,
111        'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope': 22.002275466918945,
112        'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope': 9.043970108032227,
113        'loudness_sma3_amean': 0.86087566614151,
114        'loudness_sma3_stddevNorm': 0.43875235319137573,
115        'loudness_sma3_percentile20.0': 0.5877408981323242,
116        'loudness_sma3_percentile50.0': 0.8352401852607727,
117        'loudness_sma3_percentile80.0': 1.1747918128967285,
118        'loudness_sma3_pctlrange0-2': 0.5870509147644043,
119        'loudness_sma3_meanRisingSlope': 10.285204887390137,
120        'loudness_sma3_stddevRisingSlope': 7.544795513153076,
121        'loudness_sma3_meanFallingSlope': 7.612527370452881,
122        'loudness_sma3_stddevFallingSlope': 4.15903902053833,
123        'spectralFlux_sma3_amean': 0.3213598430156708,
124        'spectralFlux_sma3_stddevNorm': 0.6921582818031311,
125        'mfcc1_sma3_amean': 10.274803161621094,
126        'mfcc1_sma3_stddevNorm': 1.1581648588180542,
127        'mfcc2_sma3_amean': 4.262020111083984,
128        'mfcc2_sma3_stddevNorm': 2.052302837371826,
129        'mfcc3_sma3_amean': 7.624598026275635,
130        'mfcc3_sma3_stddevNorm': 1.4570358991622925,
131        'mfcc4_sma3_amean': 3.6676177978515625,
132        'mfcc4_sma3_stddevNorm': 2.6902272701263428,
133        'jitterLocal_sma3nz_amean': 0.019597552716732025,
134        'jitterLocal_sma3nz_stddevNorm': 0.9063860177993774,
135        'shimmerLocaldB_sma3nz_amean': 1.264746069908142,
136        'shimmerLocaldB_sma3nz_stddevNorm': 0.4629262685775757,
137        'HNRdBACF_sma3nz_amean': 3.6400067806243896,
138        'HNRdBACF_sma3nz_stddevNorm': 0.5911334753036499,
139        'logRelF0-H1-H2_sma3nz_amean': 1.215877652168274,
140        'logRelF0-H1-H2_sma3nz_stddevNorm': 3.883843183517456,
141        'logRelF0-H1-A3_sma3nz_amean': 18.830764770507812,
142        'logRelF0-H1-A3_sma3nz_stddevNorm': 0.30870768427848816,
143        'F1frequency_sma3nz_amean': 665.1713256835938,
144        'F1frequency_sma3nz_stddevNorm': 0.41958823800086975,
145        'F1bandwidth_sma3nz_amean': 1300.2757568359375,
146        'F1bandwidth_sma3nz_stddevNorm': 0.16334553062915802,
147        'F1amplitudeLogRelF0_sma3nz_amean': -132.1533660888672,
148        'F1amplitudeLogRelF0_sma3nz_stddevNorm': -0.6691396832466125,
149        'F2frequency_sma3nz_amean': 1657.013916015625,
150        'F2frequency_sma3nz_stddevNorm': 0.17019854485988617,
151        'F2bandwidth_sma3nz_amean': 1105.7457275390625,
152        'F2bandwidth_sma3nz_stddevNorm': 0.24520403146743774,
153        'F2amplitudeLogRelF0_sma3nz_amean': -132.76707458496094,
154        'F2amplitudeLogRelF0_sma3nz_stddevNorm': -0.6468541026115417,
155        'F3frequency_sma3nz_amean': 2601.6630859375,
156        'F3frequency_sma3nz_stddevNorm': 0.11457356810569763,
157        'F3bandwidth_sma3nz_amean': 1091.15087890625,
158        'F3bandwidth_sma3nz_stddevNorm': 0.3787318468093872,
159        'F3amplitudeLogRelF0_sma3nz_amean': -134.52105712890625,
160        'F3amplitudeLogRelF0_sma3nz_stddevNorm': -0.620308518409729,
161        'alphaRatioV_sma3nz_amean': -8.626543045043945,
162        'alphaRatioV_sma3nz_stddevNorm': -0.4953792095184326,
163        'hammarbergIndexV_sma3nz_amean': 16.796842575073242,
164        'hammarbergIndexV_sma3nz_stddevNorm': 0.3567312955856323,
165        'slopeV0-500_sma3nz_amean': 0.021949246525764465,
166        'slopeV0-500_sma3nz_stddevNorm': 1.0097224712371826,
167        'slopeV500-1500_sma3nz_amean': -0.008139753714203835,
168        'slopeV500-1500_sma3nz_stddevNorm': -1.6243411302566528,
169        'spectralFluxV_sma3nz_amean': 0.4831695556640625,
170        'spectralFluxV_sma3nz_stddevNorm': 0.48576226830482483,
171        'mfcc1V_sma3nz_amean': 20.25444793701172,
172        'mfcc1V_sma3nz_stddevNorm': 0.44413772225379944,
173        'mfcc2V_sma3nz_amean': 3.619405746459961,
174        'mfcc2V_sma3nz_stddevNorm': 2.1765975952148438,
175        'mfcc3V_sma3nz_amean': 7.736487865447998,
176        'mfcc3V_sma3nz_stddevNorm': 1.8630998134613037,
177        'mfcc4V_sma3nz_amean': 4.60503625869751,
178        'mfcc4V_sma3nz_stddevNorm': 2.864668846130371,
179        'alphaRatioUV_sma3nz_amean': -2.5990121364593506,
180        'hammarbergIndexUV_sma3nz_amean': 8.862899780273438,
181        'slopeUV0-500_sma3nz_amean': 0.002166695659980178,
182        'slopeUV500-1500_sma3nz_amean': 0.006735736038535833,
183        'spectralFluxUV_sma3nz_amean': 0.24703539907932281,
184        'loudnessPeaksPerSec': 3.8834950923919678,
185        'VoicedSegmentsPerSec': 2.745098114013672,
186        'MeanVoicedSegmentLengthSec': 0.12214285880327225,
187        'StddevVoicedSegmentLengthSec': 0.09025190770626068,
188        'MeanUnvoicedSegmentLength': 0.20666664838790894,
189        'StddevUnvoicedSegmentLength': 0.17666037380695343,
190        'equivalentSoundLevel_dBp': -24.297256469726562},
191        'torchaudio': {'pitch': tensor([484.8485, 484.8485, 470.5882, 372.0930, 340.4255, 320.0000, 296.2963,
192                140.3509, 135.5932, 126.9841, 124.0310, 124.0310, 113.4752, 110.3448,
193                110.3448, 108.8435, 105.9603, 108.8435, 110.3448, 113.4752, 113.4752,
194                124.0310, 113.4752, 113.4752, 108.8435, 105.9603, 105.9603, 105.9603,
195                106.6667, 105.9603, 105.9603, 104.5752, 104.5752, 104.5752, 104.5752,
196                101.2658, 101.2658, 100.6289, 100.6289, 100.0000, 100.0000,  98.1595,
197                    98.1595,  98.1595,  95.8084,  95.8084,  95.8084,  95.2381,  95.2381,
198                    94.6746,  91.9540,  91.9540,  91.9540,  91.9540,  91.9540,  91.4286,
199                    91.4286,  91.4286,  91.4286,  91.4286,  91.4286,  91.4286,  90.9091,
200                    90.9091,  90.9091,  91.4286,  91.4286,  91.4286,  91.4286,  91.4286,
201                    91.4286,  91.4286,  91.4286,  91.4286,  91.4286,  91.4286,  91.4286,
202                    91.4286,  91.9540,  91.9540,  93.0233,  93.5673,  93.5673,  94.1176,
203                    94.6746,  94.6746,  94.6746,  95.8084,  96.3855,  96.9697, 100.0000,
204                100.0000, 100.0000, 100.0000, 100.0000, 100.0000, 103.8961, 104.5752,
205                104.5752, 106.6667, 106.6667, 106.6667, 111.1111, 116.7883, 116.7883,
206                116.7883, 118.5185, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121,
207                121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 122.1374,
208                123.0769, 123.0769, 125.9843, 125.9843, 125.9843, 123.0769, 123.0769,
209                122.1374, 122.1374, 121.2121, 121.2121, 121.2121, 120.3008, 117.6471,
210                117.6471, 117.6471, 108.8435, 108.1081, 106.6667, 105.2632, 105.2632,
211                105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632,
212                108.8435, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632,
213                105.2632, 119.4030, 119.4030, 120.3008, 120.3008, 121.2121, 121.2121,
214                121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 122.1374,
215                122.1374, 122.1374, 122.1374, 123.0769, 123.0769, 123.0769, 123.0769,
216                123.0769, 123.0769, 120.3008, 120.3008, 120.3008, 119.4030, 113.4752,
217                106.6667, 103.2258, 103.2258,  96.9697,  96.9697,  96.9697,  96.9697,
218                    96.9697,  96.9697,  96.9697,  96.9697,  96.9697,  96.9697,  96.9697,
219                    96.9697,  96.9697,  96.9697,  96.9697,  96.9697,  96.9697,  96.9697,
220                    96.9697,  96.9697,  96.9697,  96.9697,  96.9697,  97.5610,  97.5610,
221                    97.5610,  97.5610,  97.5610,  98.1595, 100.0000, 100.6289, 100.6289,
222                100.6289, 100.6289, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658,
223                101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658,  97.5610,
224                    90.9091,  89.8876,  88.8889,  88.8889,  88.3978,  87.4317,  86.0215,
225                    86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,
226                    86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,
227                    86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,  86.0215,
228                    86.0215,  86.0215,  86.0215,  86.0215,  86.4865,  86.4865,  86.4865,
229                    86.4865,  86.4865,  87.4317,  87.9121,  87.9121,  87.9121,  89.8876,
230                    90.9091,  90.9091,  90.9091,  90.9091,  90.9091,  91.4286,  91.4286,
231                    91.4286,  92.4855,  92.4855,  93.0233,  93.0233,  93.0233,  93.5673,
232                    93.5673,  95.2381,  95.2381, 100.0000, 101.9108, 112.6761, 112.6761,
233                112.6761, 122.1374, 122.1374, 122.1374, 130.0813, 126.9841, 126.9841,
234                130.0813, 130.0813, 130.0813, 130.0813, 137.9310, 130.0813, 130.0813,
235                130.0813, 126.9841, 125.9843, 126.9841, 125.9843, 125.9843, 125.9843,
236                125.9843, 125.9843, 126.9841, 126.9841, 130.0813, 130.0813, 126.9841,
237                130.0813, 130.0813, 132.2314, 130.0813, 130.0813, 132.2314, 134.4538,
238                134.4538, 135.5932, 135.5932, 137.9310, 135.5932, 135.5932, 135.5932,
239                135.5932, 137.9310, 137.9310, 140.3509, 141.5929, 141.5929, 141.5929,
240                144.1441, 144.1441, 149.5327, 149.5327, 149.5327, 141.5929, 141.5929,
241                141.5929, 149.5327, 149.5327, 153.8462, 160.0000, 160.0000, 160.0000,
242                160.0000, 160.0000, 163.2653, 164.9485, 164.9485, 164.9485, 164.9485,
243                164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485,
244                164.9485, 164.9485, 164.9485, 164.9485, 156.8627, 155.3398, 155.3398,
245                155.3398, 153.8462, 153.8462, 152.3810, 152.3810, 149.5327, 148.1481,
246                148.1481, 148.1481, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890,
247                148.1481, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890,
248                146.7890, 146.7890, 145.4545, 145.4545, 152.3810, 153.8462, 153.8462,
249                153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462,
250                153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462,
251                153.8462, 153.8462, 153.8462, 153.8462, 152.3810, 152.3810, 152.3810,
252                152.3810, 149.5327, 148.1481, 148.1481, 148.1481, 148.1481, 148.1481,
253                148.1481, 146.7890, 148.1481, 148.1481, 145.4545, 145.4545, 145.4545,
254                145.4545, 145.4545, 144.1441, 144.1441, 144.1441, 142.8571, 142.8571,
255                142.8571, 142.8571, 142.8571, 142.8571, 142.8571, 144.1441, 144.1441,
256                145.4545, 145.4545, 145.4545, 145.4545, 146.7890, 146.7890, 146.7890,
257                146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890,
258                146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 145.4545,
259                145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545,
260                146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890,
261                400.0000, 400.0000, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485,
262                484.8485, 484.8485, 484.8485, 484.8485, 484.8485]),
263        'mel_filter_bank': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02,  ..., 0.0000e+00, 0.0000e+00,
264                    0.0000e+00],
265                [3.0977e-04, 1.5698e-02, 1.5785e-02,  ..., 0.0000e+00, 0.0000e+00,
266                    0.0000e+00],
267                [8.2318e-05, 1.4367e-02, 2.8095e-01,  ..., 0.0000e+00, 0.0000e+00,
268                    0.0000e+00],
269                ...,
270                [3.6322e-05, 9.7330e-03, 5.4812e-02,  ..., 0.0000e+00, 0.0000e+00,
271                    0.0000e+00],
272                [2.2802e-05, 1.2481e-02, 5.8374e-02,  ..., 0.0000e+00, 0.0000e+00,
273                    0.0000e+00],
274                [5.3029e-05, 3.1305e-02, 7.9842e-02,  ..., 0.0000e+00, 0.0000e+00,
275                    0.0000e+00]]),
276        'mfcc': tensor([[-6.2570e+02, -4.7505e+02, -3.1078e+02,  ..., -6.3893e+02,
277                    -6.3893e+02, -6.3893e+02],
278                [ 1.3593e+01,  1.9928e+01,  2.6022e+01,  ...,  3.9824e-05,
279                    3.9824e-05,  3.9824e-05],
280                [ 7.3933e+00, -2.1680e+01, -1.4259e+01,  ..., -1.3440e-05,
281                    -1.3440e-05, -1.3440e-05],
282                ...,
283                [ 1.8122e+00, -3.1072e+00, -3.7336e+00,  ...,  7.0669e-05,
284                    7.0669e-05,  7.0669e-05],
285                [-2.7518e-01, -9.4738e+00, -2.3157e+00,  ..., -1.7963e-04,
286                    -1.7963e-04, -1.7963e-04],
287                [ 2.3144e-01, -6.4129e+00, -8.4420e+00,  ..., -1.5891e-04,
288                    -1.5891e-04, -1.5891e-04]]),
289        'mel_spectrogram': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02,  ..., 0.0000e+00, 0.0000e+00,
290                    0.0000e+00],
291                [3.0977e-04, 1.5698e-02, 1.5785e-02,  ..., 0.0000e+00, 0.0000e+00,
292                    0.0000e+00],
293                [8.2318e-05, 1.4367e-02, 2.8095e-01,  ..., 0.0000e+00, 0.0000e+00,
294                    0.0000e+00],
295                ...,
296                [3.6322e-05, 9.7330e-03, 5.4812e-02,  ..., 0.0000e+00, 0.0000e+00,
297                    0.0000e+00],
298                [2.2802e-05, 1.2481e-02, 5.8374e-02,  ..., 0.0000e+00, 0.0000e+00,
299                    0.0000e+00],
300                [5.3029e-05, 3.1305e-02, 7.9842e-02,  ..., 0.0000e+00, 0.0000e+00,
301                    0.0000e+00]]),
302        'spectrogram': tensor([[3.5553e-06, 5.9962e-03, 2.7176e-02,  ..., 0.0000e+00, 0.0000e+00,
303                    0.0000e+00],
304                [5.0707e-04, 1.1670e-02, 1.5016e-02,  ..., 0.0000e+00, 0.0000e+00,
305                    0.0000e+00],
306                [3.1901e-04, 1.8529e-02, 1.8078e-02,  ..., 0.0000e+00, 0.0000e+00,
307                    0.0000e+00],
308                ...,
309                [1.0302e-05, 3.5917e-03, 2.7169e-03,  ..., 0.0000e+00, 0.0000e+00,
310                    0.0000e+00],
311                [9.6637e-08, 1.3364e-03, 1.8495e-02,  ..., 0.0000e+00, 0.0000e+00,
312                    0.0000e+00],
313                [1.4414e-05, 1.0598e-04, 2.8004e-02,  ..., 0.0000e+00, 0.0000e+00,
314                    0.0000e+00]])},
315        'parselmouth': ({'duration': 5.1613125,
316            'speaking_rate': 3.874983349680919,
317            'articulation_rate': 3.874983349680919,
318            'phonation_ratio': 1.0,
319            'pause_rate': 0.0,
320            'mean_pause_duration': 0.0,
321            'mean_f0_hertz': 118.59917806814313,
322            'std_f0_hertz': 30.232960797931817,
323            'mean_intensity_db': 69.76277128148347,
324            'std_intensity_db': 58.54414165935646,
325            'range_ratio_intensity_db': -0.25736445047981316,
326            'pitch_floor': 60.0,
327            'pitch_ceiling': 250.0,
328            'mean_hnr_db': 3.3285614070654375,
329            'std_hnr_db': 3.36490968797237,
330            'spectral_slope': -13.982306776816046,
331            'spectral_tilt': -0.004414961849917737,
332            'cepstral_peak_prominence_mean': 7.0388038514346825,
333            'cepstral_peak_prominence_std': 1.5672438573255245,
334            'mean_f1_loc': 613.4664268420964,
335            'std_f1_loc': 303.98235579059883,
336            'mean_b1_loc': 401.96960219300837,
337            'std_b1_loc': 400.9001719378358,
338            'mean_f2_loc': 1701.7755281579418,
339            'std_f2_loc': 325.4405394017738,
340            'mean_b2_loc': 434.542188503193,
341            'std_b2_loc': 380.8914612651878,
342            'spectral_gravity': 579.587511962247,
343            'spectral_std_dev': 651.3025011919739,
344            'spectral_skewness': 3.5879707548251045,
345            'spectral_kurtosis': 19.991495997865282,
346            'local_jitter': 0.02553484151620524,
347            'localabsolute_jitter': 0.00021392842618599855,
348            'rap_jitter': 0.012174051087556429,
349            'ppq5_jitter': 0.01597797849248675,
350            'ddp_jitter': 0.03652215326266929,
351            'local_shimmer': 0.1530474665829716,
352            'localDB_shimmer': 1.3511061323188314,
353            'apq3_shimmer': 0.0702984931637734,
354            'apq5_shimmer': 0.09680154282272849,
355            'apq11_shimmer': 0.19065409516266155,
356            'dda_shimmer': 0.2108954794913202},),
357        'torchaudio_squim': {'stoi': 0.9247563481330872,
358        'pesq': 1.3702949285507202,
359        'si_sdr': 11.71167278289795}}]
360
361    Example (disable OpenSMILE; customize torchaudio):
362        >>> from pathlib import Path
363        >>> from senselab.audio.data_structures import Audio
364        >>> a1 = Audio(filepath=Path("sample1.wav").resolve())
365        >>> feats = extract_features_from_audios(
366        ...     [a1],
367        ...     opensmile=False,
368        ...     torchaudio={
369        ...         "n_fft": 2048,
370        ...         "hop_length": 256
371        ...     },
372        ... )
373        >>> "opensmile" in feats[0]
374        False
375
376    Example (Parselmouth only, custom pitch range):
377        >>> from pathlib import Path
378        >>> from senselab.audio.data_structures import Audio
379        >>> a1 = Audio(filepath=Path("sample1.wav").resolve())
380        >>> feats = extract_features_from_audios(
381        ...     [a1],
382        ...     opensmile=False,
383        ...     torchaudio=False,
384        ...     torchaudio_squim=False,
385        ...     parselmouth={"pitch_unit": "Hertz"},
386        ... )
387        >>> "praat_parselmouth" in feats[0]
388        True
389    """
390    if opensmile:
391        default_opensmile: Dict[str, Any] = {"feature_set": "eGeMAPSv02", "feature_level": "Functionals"}
392        if isinstance(opensmile, dict):
393            my_opensmile = {**default_opensmile, **opensmile}
394        else:
395            my_opensmile = default_opensmile
396        opensmile_features = extract_opensmile_features_from_audios(audios, **my_opensmile)  # type: ignore
397    if parselmouth:
398        default_parselmouth: Dict[str, Any] = {
399            "time_step": 0.005,
400            "window_length": 0.025,
401            "pitch_unit": "Hertz",
402            "speech_rate": True,
403            "intensity_descriptors": True,
404            "harmonicity_descriptors": True,
405            "formants": True,
406            "spectral_moments": True,
407            "pitch": True,
408            "slope_tilt": True,
409            "cpp_descriptors": True,
410            "duration": True,
411            "jitter": True,
412            "shimmer": True,
413        }
414        # Update default_parselmouth with provided parselmouth dictionary
415        if isinstance(parselmouth, dict):
416            my_parselmouth = {**default_parselmouth, **parselmouth}
417        else:
418            my_parselmouth = default_parselmouth
419
420        parselmouth_features = extract_praat_parselmouth_features_from_audios(audios=audios, **my_parselmouth)  # type: ignore
421
422    if torchaudio:
423        default_torchaudio: Dict[str, Any] = {
424            "freq_low": 80,
425            "freq_high": 500,
426            "n_fft": 1024,
427            "n_mels": 128,
428            "n_mfcc": 40,
429            "win_length": None,
430            "hop_length": None,
431        }
432        if isinstance(torchaudio, dict):
433            my_torchaudio = {**default_torchaudio, **torchaudio}
434        else:
435            my_torchaudio = default_torchaudio
436
437        torchaudio_features = extract_torchaudio_features_from_audios(audios=audios, **my_torchaudio)  # type: ignore
438    if torchaudio_squim:
439        torchaudio_squim_features = extract_objective_quality_features_from_audios(audios=audios)
440
441    results = []
442    for i in range(len(audios)):
443        result = {}
444        if opensmile:
445            result["opensmile"] = opensmile_features[i]
446        if parselmouth:
447            result["praat_parselmouth"] = parselmouth_features[i]
448        if torchaudio:
449            result["torchaudio"] = torchaudio_features[i]
450        if torchaudio_squim:
451            result["torchaudio_squim"] = torchaudio_squim_features[i]
452        results.append(result)
453
454    return results

Extract multi-backend features for each Audio and return a list of dicts.

Enabled backends run in parallelizable sub-workflows (where applicable) and their outputs are merged per audio. Disable any backend by passing False; customize a backend by passing a dict (see below for keys and defaults).

Arguments:
  • audios (list[Audio]): Input audio objects.
  • opensmile (dict | bool, optional):
    • False → skip OpenSMILE.
    • True → use defaults: {"feature_set": "eGeMAPSv02", "feature_level": "Functionals"}
    • dict → override any of the above keys. feature_set and feature_level should match OpenSMILE presets.
  • parselmouth (dict | bool, optional):
    • False → skip Praat/Parselmouth.
    • True → use defaults (pitch, intensity, jitter, shimmer, formants, etc. enabled): {"time_step": 0.005, "window_length": 0.025, "pitch_unit": "Hertz", "speech_rate": True, "intensity_descriptors": True, "harmonicity_descriptors": True, "formants": True, "spectral_moments": True, "pitch": True, "slope_tilt": True, "cpp_descriptors": True, "duration": True, "jitter": True, "shimmer": True}
    • dict → override any of the above keys.
  • torchaudio (dict | bool, optional):
    • False → skip torchaudio.
    • True → use defaults: {"freq_low": 80, "freq_high": 500, "n_fft": 1024, "n_mels": 128, "n_mfcc": 40, "win_length": None, "hop_length": None}
    • dict → override any of the above keys (e.g., n_fft, hop_length).
  • torchaudio_squim (bool, optional):
    • False → skip objective quality metrics.
    • True → compute metrics such as STOI, PESQ, SI-SDR (backend-dependent defaults).
Returns:

list[dict[str, Any]]: One dict per input audio. Keys present depend on enabled backends; typical structure:

  • "opensmile" → dict[str, float] of aggregated descriptors.
  • "praat_parselmouth" → dict[str, float] (prosody/voice-quality).
  • "torchaudio" → nested dict[str, Tensor] (e.g., spectrogram, mel_spectrogram, mfcc, pitch). Tensors have shapes defined by your STFT/mel/MFCC settings.
  • "torchaudio_squim" → dict[str, float] with objective quality scores.
Raises:
  • ModuleNotFoundError: If a requested backend library is not installed (e.g., opensmile, praat-parselmouth, or dependencies required by torchaudio-squim).
  • ValueError: If invalid parameter combinations are provided to a backend.
Tips:
  • Memory: Torchaudio tensors (spectrograms, mels) can be large. Convert or downsample if you only need summary stats.

Example (all defaults):

>>> from pathlib import Path
>>> from senselab.audio.data_structures import Audio
>>> a1 = Audio(filepath=Path("sample1.wav").resolve())
>>> feats = extract_features_from_audios([a1])
>>> sorted(feats[0].keys())
['opensmile', 'praat_parselmouth', 'torchaudio', 'torchaudio_squim']

Example (all defaults II):

from senselab.audio.data_structures import Audio from senselab.audio.tasks.features_extraction import extract_features_from_audios from pathlib import Path a1 = Audio(filepath=Path("sample1.wav").resolve()) extract_features_from_audios([a1]) [{'opensmile': {'F0semitoneFrom27.5Hz_sma3nz_amean': 25.710796356201172, 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': 0.1605353206396103, 'F0semitoneFrom27.5Hz_sma3nz_percentile20.0': 21.095951080322266, 'F0semitoneFrom27.5Hz_sma3nz_percentile50.0': 25.9762020111084, 'F0semitoneFrom27.5Hz_sma3nz_percentile80.0': 29.512413024902344, 'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2': 8.416461944580078, 'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope': 82.34796905517578, 'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope': 99.20043182373047, 'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope': 22.002275466918945, 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope': 9.043970108032227, 'loudness_sma3_amean': 0.86087566614151, 'loudness_sma3_stddevNorm': 0.43875235319137573, 'loudness_sma3_percentile20.0': 0.5877408981323242, 'loudness_sma3_percentile50.0': 0.8352401852607727, 'loudness_sma3_percentile80.0': 1.1747918128967285, 'loudness_sma3_pctlrange0-2': 0.5870509147644043, 'loudness_sma3_meanRisingSlope': 10.285204887390137, 'loudness_sma3_stddevRisingSlope': 7.544795513153076, 'loudness_sma3_meanFallingSlope': 7.612527370452881, 'loudness_sma3_stddevFallingSlope': 4.15903902053833, 'spectralFlux_sma3_amean': 0.3213598430156708, 'spectralFlux_sma3_stddevNorm': 0.6921582818031311, 'mfcc1_sma3_amean': 10.274803161621094, 'mfcc1_sma3_stddevNorm': 1.1581648588180542, 'mfcc2_sma3_amean': 4.262020111083984, 'mfcc2_sma3_stddevNorm': 2.052302837371826, 'mfcc3_sma3_amean': 7.624598026275635, 'mfcc3_sma3_stddevNorm': 1.4570358991622925, 'mfcc4_sma3_amean': 3.6676177978515625, 'mfcc4_sma3_stddevNorm': 2.6902272701263428, 'jitterLocal_sma3nz_amean': 0.019597552716732025, 'jitterLocal_sma3nz_stddevNorm': 0.9063860177993774, 'shimmerLocaldB_sma3nz_amean': 
1.264746069908142, 'shimmerLocaldB_sma3nz_stddevNorm': 0.4629262685775757, 'HNRdBACF_sma3nz_amean': 3.6400067806243896, 'HNRdBACF_sma3nz_stddevNorm': 0.5911334753036499, 'logRelF0-H1-H2_sma3nz_amean': 1.215877652168274, 'logRelF0-H1-H2_sma3nz_stddevNorm': 3.883843183517456, 'logRelF0-H1-A3_sma3nz_amean': 18.830764770507812, 'logRelF0-H1-A3_sma3nz_stddevNorm': 0.30870768427848816, 'F1frequency_sma3nz_amean': 665.1713256835938, 'F1frequency_sma3nz_stddevNorm': 0.41958823800086975, 'F1bandwidth_sma3nz_amean': 1300.2757568359375, 'F1bandwidth_sma3nz_stddevNorm': 0.16334553062915802, 'F1amplitudeLogRelF0_sma3nz_amean': -132.1533660888672, 'F1amplitudeLogRelF0_sma3nz_stddevNorm': -0.6691396832466125, 'F2frequency_sma3nz_amean': 1657.013916015625, 'F2frequency_sma3nz_stddevNorm': 0.17019854485988617, 'F2bandwidth_sma3nz_amean': 1105.7457275390625, 'F2bandwidth_sma3nz_stddevNorm': 0.24520403146743774, 'F2amplitudeLogRelF0_sma3nz_amean': -132.76707458496094, 'F2amplitudeLogRelF0_sma3nz_stddevNorm': -0.6468541026115417, 'F3frequency_sma3nz_amean': 2601.6630859375, 'F3frequency_sma3nz_stddevNorm': 0.11457356810569763, 'F3bandwidth_sma3nz_amean': 1091.15087890625, 'F3bandwidth_sma3nz_stddevNorm': 0.3787318468093872, 'F3amplitudeLogRelF0_sma3nz_amean': -134.52105712890625, 'F3amplitudeLogRelF0_sma3nz_stddevNorm': -0.620308518409729, 'alphaRatioV_sma3nz_amean': -8.626543045043945, 'alphaRatioV_sma3nz_stddevNorm': -0.4953792095184326, 'hammarbergIndexV_sma3nz_amean': 16.796842575073242, 'hammarbergIndexV_sma3nz_stddevNorm': 0.3567312955856323, 'slopeV0-500_sma3nz_amean': 0.021949246525764465, 'slopeV0-500_sma3nz_stddevNorm': 1.0097224712371826, 'slopeV500-1500_sma3nz_amean': -0.008139753714203835, 'slopeV500-1500_sma3nz_stddevNorm': -1.6243411302566528, 'spectralFluxV_sma3nz_amean': 0.4831695556640625, 'spectralFluxV_sma3nz_stddevNorm': 0.48576226830482483, 'mfcc1V_sma3nz_amean': 20.25444793701172, 'mfcc1V_sma3nz_stddevNorm': 0.44413772225379944, 'mfcc2V_sma3nz_amean': 
3.619405746459961, 'mfcc2V_sma3nz_stddevNorm': 2.1765975952148438, 'mfcc3V_sma3nz_amean': 7.736487865447998, 'mfcc3V_sma3nz_stddevNorm': 1.8630998134613037, 'mfcc4V_sma3nz_amean': 4.60503625869751, 'mfcc4V_sma3nz_stddevNorm': 2.864668846130371, 'alphaRatioUV_sma3nz_amean': -2.5990121364593506, 'hammarbergIndexUV_sma3nz_amean': 8.862899780273438, 'slopeUV0-500_sma3nz_amean': 0.002166695659980178, 'slopeUV500-1500_sma3nz_amean': 0.006735736038535833, 'spectralFluxUV_sma3nz_amean': 0.24703539907932281, 'loudnessPeaksPerSec': 3.8834950923919678, 'VoicedSegmentsPerSec': 2.745098114013672, 'MeanVoicedSegmentLengthSec': 0.12214285880327225, 'StddevVoicedSegmentLengthSec': 0.09025190770626068, 'MeanUnvoicedSegmentLength': 0.20666664838790894, 'StddevUnvoicedSegmentLength': 0.17666037380695343, 'equivalentSoundLevel_dBp': -24.297256469726562}, 'torchaudio': {'pitch': tensor([484.8485, 484.8485, 470.5882, 372.0930, 340.4255, 320.0000, 296.2963, 140.3509, 135.5932, 126.9841, 124.0310, 124.0310, 113.4752, 110.3448, 110.3448, 108.8435, 105.9603, 108.8435, 110.3448, 113.4752, 113.4752, 124.0310, 113.4752, 113.4752, 108.8435, 105.9603, 105.9603, 105.9603, 106.6667, 105.9603, 105.9603, 104.5752, 104.5752, 104.5752, 104.5752, 101.2658, 101.2658, 100.6289, 100.6289, 100.0000, 100.0000, 98.1595, 98.1595, 98.1595, 95.8084, 95.8084, 95.8084, 95.2381, 95.2381, 94.6746, 91.9540, 91.9540, 91.9540, 91.9540, 91.9540, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 90.9091, 90.9091, 90.9091, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.9540, 91.9540, 93.0233, 93.5673, 93.5673, 94.1176, 94.6746, 94.6746, 94.6746, 95.8084, 96.3855, 96.9697, 100.0000, 100.0000, 100.0000, 100.0000, 100.0000, 100.0000, 103.8961, 104.5752, 104.5752, 106.6667, 106.6667, 106.6667, 111.1111, 116.7883, 116.7883, 116.7883, 118.5185, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 
121.2121, 122.1374, 123.0769, 123.0769, 125.9843, 125.9843, 125.9843, 123.0769, 123.0769, 122.1374, 122.1374, 121.2121, 121.2121, 121.2121, 120.3008, 117.6471, 117.6471, 117.6471, 108.8435, 108.1081, 106.6667, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 108.8435, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 119.4030, 119.4030, 120.3008, 120.3008, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 122.1374, 122.1374, 122.1374, 122.1374, 123.0769, 123.0769, 123.0769, 123.0769, 123.0769, 123.0769, 120.3008, 120.3008, 120.3008, 119.4030, 113.4752, 106.6667, 103.2258, 103.2258, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 97.5610, 97.5610, 97.5610, 97.5610, 97.5610, 98.1595, 100.0000, 100.6289, 100.6289, 100.6289, 100.6289, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 97.5610, 90.9091, 89.8876, 88.8889, 88.8889, 88.3978, 87.4317, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.4865, 86.4865, 86.4865, 86.4865, 86.4865, 87.4317, 87.9121, 87.9121, 87.9121, 89.8876, 90.9091, 90.9091, 90.9091, 90.9091, 90.9091, 91.4286, 91.4286, 91.4286, 92.4855, 92.4855, 93.0233, 93.0233, 93.0233, 93.5673, 93.5673, 95.2381, 95.2381, 100.0000, 101.9108, 112.6761, 112.6761, 112.6761, 122.1374, 122.1374, 122.1374, 130.0813, 126.9841, 126.9841, 130.0813, 130.0813, 130.0813, 130.0813, 137.9310, 130.0813, 130.0813, 130.0813, 126.9841, 125.9843, 126.9841, 125.9843, 125.9843, 125.9843, 125.9843, 125.9843, 126.9841, 126.9841, 130.0813, 130.0813, 126.9841, 130.0813, 130.0813, 132.2314, 
130.0813, 130.0813, 132.2314, 134.4538, 134.4538, 135.5932, 135.5932, 137.9310, 135.5932, 135.5932, 135.5932, 135.5932, 137.9310, 137.9310, 140.3509, 141.5929, 141.5929, 141.5929, 144.1441, 144.1441, 149.5327, 149.5327, 149.5327, 141.5929, 141.5929, 141.5929, 149.5327, 149.5327, 153.8462, 160.0000, 160.0000, 160.0000, 160.0000, 160.0000, 163.2653, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 156.8627, 155.3398, 155.3398, 155.3398, 153.8462, 153.8462, 152.3810, 152.3810, 149.5327, 148.1481, 148.1481, 148.1481, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 148.1481, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 145.4545, 145.4545, 152.3810, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 152.3810, 152.3810, 152.3810, 152.3810, 149.5327, 148.1481, 148.1481, 148.1481, 148.1481, 148.1481, 148.1481, 146.7890, 148.1481, 148.1481, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 144.1441, 144.1441, 144.1441, 142.8571, 142.8571, 142.8571, 142.8571, 142.8571, 142.8571, 142.8571, 144.1441, 144.1441, 145.4545, 145.4545, 145.4545, 145.4545, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 400.0000, 400.0000, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485]), 'mel_filter_bank': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [3.0977e-04, 1.5698e-02, 1.5785e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [8.2318e-05, 1.4367e-02, 2.8095e-01, ..., 
0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [3.6322e-05, 9.7330e-03, 5.4812e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [2.2802e-05, 1.2481e-02, 5.8374e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [5.3029e-05, 3.1305e-02, 7.9842e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]]), 'mfcc': tensor([[-6.2570e+02, -4.7505e+02, -3.1078e+02, ..., -6.3893e+02, -6.3893e+02, -6.3893e+02], [ 1.3593e+01, 1.9928e+01, 2.6022e+01, ..., 3.9824e-05, 3.9824e-05, 3.9824e-05], [ 7.3933e+00, -2.1680e+01, -1.4259e+01, ..., -1.3440e-05, -1.3440e-05, -1.3440e-05], ..., [ 1.8122e+00, -3.1072e+00, -3.7336e+00, ..., 7.0669e-05, 7.0669e-05, 7.0669e-05], [-2.7518e-01, -9.4738e+00, -2.3157e+00, ..., -1.7963e-04, -1.7963e-04, -1.7963e-04], [ 2.3144e-01, -6.4129e+00, -8.4420e+00, ..., -1.5891e-04, -1.5891e-04, -1.5891e-04]]), 'mel_spectrogram': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [3.0977e-04, 1.5698e-02, 1.5785e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [8.2318e-05, 1.4367e-02, 2.8095e-01, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [3.6322e-05, 9.7330e-03, 5.4812e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [2.2802e-05, 1.2481e-02, 5.8374e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [5.3029e-05, 3.1305e-02, 7.9842e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]]), 'spectrogram': tensor([[3.5553e-06, 5.9962e-03, 2.7176e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [5.0707e-04, 1.1670e-02, 1.5016e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [3.1901e-04, 1.8529e-02, 1.8078e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [1.0302e-05, 3.5917e-03, 2.7169e-03, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [9.6637e-08, 1.3364e-03, 1.8495e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [1.4414e-05, 1.0598e-04, 2.8004e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]])}, 'parselmouth': ({'duration': 5.1613125, 'speaking_rate': 3.874983349680919, 'articulation_rate': 3.874983349680919, 'phonation_ratio': 1.0, 'pause_rate': 0.0, 
'mean_pause_duration': 0.0, 'mean_f0_hertz': 118.59917806814313, 'std_f0_hertz': 30.232960797931817, 'mean_intensity_db': 69.76277128148347, 'std_intensity_db': 58.54414165935646, 'range_ratio_intensity_db': -0.25736445047981316, 'pitch_floor': 60.0, 'pitch_ceiling': 250.0, 'mean_hnr_db': 3.3285614070654375, 'std_hnr_db': 3.36490968797237, 'spectral_slope': -13.982306776816046, 'spectral_tilt': -0.004414961849917737, 'cepstral_peak_prominence_mean': 7.0388038514346825, 'cepstral_peak_prominence_std': 1.5672438573255245, 'mean_f1_loc': 613.4664268420964, 'std_f1_loc': 303.98235579059883, 'mean_b1_loc': 401.96960219300837, 'std_b1_loc': 400.9001719378358, 'mean_f2_loc': 1701.7755281579418, 'std_f2_loc': 325.4405394017738, 'mean_b2_loc': 434.542188503193, 'std_b2_loc': 380.8914612651878, 'spectral_gravity': 579.587511962247, 'spectral_std_dev': 651.3025011919739, 'spectral_skewness': 3.5879707548251045, 'spectral_kurtosis': 19.991495997865282, 'local_jitter': 0.02553484151620524, 'localabsolute_jitter': 0.00021392842618599855, 'rap_jitter': 0.012174051087556429, 'ppq5_jitter': 0.01597797849248675, 'ddp_jitter': 0.03652215326266929, 'local_shimmer': 0.1530474665829716, 'localDB_shimmer': 1.3511061323188314, 'apq3_shimmer': 0.0702984931637734, 'apq5_shimmer': 0.09680154282272849, 'apq11_shimmer': 0.19065409516266155, 'dda_shimmer': 0.2108954794913202},), 'torchaudio_squim': {'stoi': 0.9247563481330872, 'pesq': 1.3702949285507202, 'si_sdr': 11.71167278289795}}]

Example (disable OpenSMILE; customize torchaudio):

>>> from pathlib import Path
>>> from senselab.audio.data_structures import Audio
>>> a1 = Audio(filepath=Path("sample1.wav").resolve())
>>> feats = extract_features_from_audios(
...     [a1],
...     opensmile=False,
...     torchaudio={
...         "n_fft": 2048,
...         "hop_length": 256
...     },
... )
>>> "opensmile" in feats[0]
False

Example (Parselmouth only, explicit pitch unit):

>>> from pathlib import Path
>>> from senselab.audio.data_structures import Audio
>>> a1 = Audio(filepath=Path("sample1.wav").resolve())
>>> feats = extract_features_from_audios(
...     [a1],
...     opensmile=False,
...     torchaudio=False,
...     torchaudio_squim=False,
...     parselmouth={"pitch_unit": "Hertz"},
... )
>>> "praat_parselmouth" in feats[0]
True