senselab.audio.tasks.features_extraction.api
This module provides functions to describe audio files by extracting features.
The extracted descriptors represent dimensions within a multi-dimensional space, allowing for a detailed analysis of each file's characteristics. These dimensions can be updated or refined in the future as part of ongoing development efforts. This module is currently a work in progress.
"""This module provides functions to describe audio files by extracting features.

The extracted descriptors represent dimensions within a multi-dimensional space,
allowing for a detailed analysis of each file's characteristics.
These dimensions can be updated or refined in the future as part of ongoing development efforts.
This module is currently a work in progress.
"""

from typing import Any, Dict, List, Union

from senselab.audio.data_structures import Audio

from .opensmile import extract_opensmile_features_from_audios
from .praat_parselmouth import extract_praat_parselmouth_features_from_audios
from .torchaudio import extract_torchaudio_features_from_audios
from .torchaudio_squim import extract_objective_quality_features_from_audios


def _resolve_options(user_options: Union[Dict[str, Any], bool], defaults: Dict[str, Any]) -> Dict[str, Any]:
    """Return ``defaults`` overlaid with ``user_options`` when the latter is a dictionary."""
    if isinstance(user_options, dict):
        # User-provided keys win over the defaults; keys the user omitted keep their default value.
        return {**defaults, **user_options}
    return defaults


def extract_features_from_audios(
    audios: List[Audio],
    opensmile: Union[Dict[str, str], bool] = True,
    parselmouth: Union[Dict[str, str], bool] = True,
    torchaudio: Union[Dict[str, str], bool] = True,
    torchaudio_squim: bool = True,
) -> List[Dict[str, Any]]:
    """Extract features from a list of audio objects.

    Each enabled extractor is run once over the whole list of audios, and its per-audio
    outputs are then regrouped into one dictionary per audio.

    Args:
        audios (List[Audio]): The list of audio objects to extract features from.
        opensmile (Union[Dict[str, str], bool]): Parameters for OpenSMILE feature extraction.
            If False, OpenSMILE features will not be extracted. If True, uses default OpenSMILE parameters.
            If a dictionary, its entries override the defaults; recognised keys are "feature_set",
            "feature_level", "plugin", "plugin_args" and "cache_dir". Missing keys keep their default.
        parselmouth (Union[Dict[str, str], bool]): Parameters for Praat Parselmouth feature extraction.
            If False, Praat Parselmouth features will not be extracted.
            If True, uses default Praat Parselmouth parameters.
            If a dictionary, its entries override the defaults; recognised keys are "time_step",
            "window_length", "pitch_unit", "cache_dir", "speech_rate", "intensity_descriptors",
            "harmonicity_descriptors", "formants", "spectral_moments", "pitch", "slope_tilt",
            "cpp_descriptors", "duration", "jitter", "shimmer", "plugin" and "plugin_args".
            Missing keys keep their default.
        torchaudio (Union[Dict[str, str], bool]): Parameters for torchaudio feature extraction.
            If False, torchaudio features will not be extracted. If True, uses default torchaudio parameters.
            If a dictionary, its entries override the defaults; recognised keys are "freq_low", "freq_high",
            "n_fft", "n_mels", "n_mfcc", "win_length", "hop_length", "plugin", "plugin_args" and
            "cache_dir". Missing keys keep their default.
        torchaudio_squim (bool): Whether to extract torchaudio_squim features.
            If False, torchaudio_squim features will not be extracted.

    Note:
        The extractor switches are evaluated for truthiness, so an empty dictionary behaves like
        False and disables the corresponding extractor.

    Returns:
        List[Dict[str, Any]]: One feature dictionary per audio, in the same order as ``audios``.
        Each dictionary holds one entry per enabled extractor under the keys "opensmile",
        "praat_parselmouth", "torchaudio" and "torchaudio_squim".

    Examples:
        Output abridged for readability; the exact keys and value types are determined by the
        individual extractors.

        >>> extract_features_from_audios(audios)
        [{'opensmile': {'F0semitoneFrom27.5Hz_sma3nz_amean': 25.710796356201172,
                        'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': 0.1605353206396103,
                        'loudness_sma3_amean': 0.86087566614151,
                        ...
                        'equivalentSoundLevel_dBp': -24.297256469726562},
          'praat_parselmouth': ({'duration': 5.1613125,
                                 'speaking_rate': 3.874983349680919,
                                 'mean_f0_hertz': 118.59917806814313,
                                 ...
                                 'dda_shimmer': 0.2108954794913202},),
          'torchaudio': {'pitch': tensor([484.8485, 484.8485, 470.5882, ..., 484.8485, 484.8485, 484.8485]),
                         'mel_filter_bank': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02, ..., 0.0000e+00],
                                                    ...,
                                                    [5.3029e-05, 3.1305e-02, 7.9842e-02, ..., 0.0000e+00]]),
                         'mfcc': tensor([[-6.2570e+02, -4.7505e+02, -3.1078e+02, ..., -6.3893e+02],
                                         ...,
                                         [ 2.3144e-01, -6.4129e+00, -8.4420e+00, ..., -1.5891e-04]]),
                         'mel_spectrogram': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02, ..., 0.0000e+00],
                                                    ...,
                                                    [5.3029e-05, 3.1305e-02, 7.9842e-02, ..., 0.0000e+00]]),
                         'spectrogram': tensor([[3.5553e-06, 5.9962e-03, 2.7176e-02, ..., 0.0000e+00],
                                                ...,
                                                [1.4414e-05, 1.0598e-04, 2.8004e-02, ..., 0.0000e+00]])},
          'torchaudio_squim': {'stoi': 0.9247563481330872,
                               'pesq': 1.3702949285507202,
                               'si_sdr': 11.71167278289795}}]
    """
    # Per-extractor outputs, keyed by the name used in the returned dictionaries.
    # Insertion order fixes the key order of every per-audio result.
    features_by_extractor: Dict[str, Any] = {}

    if opensmile:
        opensmile_options = _resolve_options(
            opensmile,
            {
                "feature_set": "eGeMAPSv02",
                "feature_level": "Functionals",
                "plugin": "serial",
                "plugin_args": {},
                "cache_dir": None,
            },
        )
        features_by_extractor["opensmile"] = extract_opensmile_features_from_audios(
            audios, **opensmile_options
        )  # type: ignore

    if parselmouth:
        parselmouth_options = _resolve_options(
            parselmouth,
            {
                "time_step": 0.005,
                "window_length": 0.025,
                "pitch_unit": "Hertz",
                "cache_dir": None,
                "speech_rate": True,
                "intensity_descriptors": True,
                "harmonicity_descriptors": True,
                "formants": True,
                "spectral_moments": True,
                "pitch": True,
                "slope_tilt": True,
                "cpp_descriptors": True,
                "duration": True,
                "jitter": True,
                "shimmer": True,
                "plugin": "serial",
                "plugin_args": {},
            },
        )
        features_by_extractor["praat_parselmouth"] = extract_praat_parselmouth_features_from_audios(
            audios=audios, **parselmouth_options
        )  # type: ignore

    if torchaudio:
        torchaudio_options = _resolve_options(
            torchaudio,
            {
                "freq_low": 80,
                "freq_high": 500,
                "n_fft": 1024,
                "n_mels": 128,
                "n_mfcc": 40,
                "win_length": None,
                "hop_length": None,
                "plugin": "serial",
                "plugin_args": {},
                "cache_dir": None,
            },
        )
        features_by_extractor["torchaudio"] = extract_torchaudio_features_from_audios(
            audios=audios, **torchaudio_options
        )  # type: ignore

    if torchaudio_squim:
        features_by_extractor["torchaudio_squim"] = extract_objective_quality_features_from_audios(audios=audios)

    # Regroup from "one sequence per extractor" to "one dictionary per audio"; each extractor's
    # output is indexed by the audio's position in the input list.
    return [
        {name: features[index] for name, features in features_by_extractor.items()}
        for index in range(len(audios))
    ]
def
extract_features_from_audios( audios: List[senselab.audio.data_structures.audio.Audio], opensmile: Union[Dict[str, str], bool] = True, parselmouth: Union[Dict[str, str], bool] = True, torchaudio: Union[Dict[str, str], bool] = True, torchaudio_squim: bool = True) -> List[Dict[str, Any]]:
20def extract_features_from_audios( 21 audios: List[Audio], 22 opensmile: Union[Dict[str, str], bool] = True, 23 parselmouth: Union[Dict[str, str], bool] = True, 24 torchaudio: Union[Dict[str, str], bool] = True, 25 torchaudio_squim: bool = True, 26) -> List[Dict[str, Any]]: 27 """Extract features from a list of audio objects. 28 29 Args: 30 audios (List[Audio]): The list of audio objects to extract features from. 31 opensmile (Union[Dict[str, str], bool]): Parameters for OpenSMILE feature extraction. 32 If False, OpenSMILE features will not be extracted. If True, uses default OpenSMILE parameters. 33 If a dictionary, should contain "feature_set" and "feature_level" keys. 34 parselmouth (Union[Dict[str, str], bool]): Parameters for Praat Parselmouth feature extraction. 35 If False, Praat Parselmouth features will not be extracted. 36 If True, uses default Praat Parselmouth parameters. 37 If a dictionary, should contain "time_step", "window_length", "pitch_unit", "cache_dir", 38 "speech_rate", "intensity_descriptors", "harmonicity_descriptors", "formants", "spectral_moments", "pitch", 39 "slope_tilt", "cpp_descriptors", "duration", "jitter", "shimmer" keys. 40 torchaudio (Union[Dict[str, str], bool]): Parameters for torchaudio feature extraction. 41 If False, torchaudio features will not be extracted. If True, uses default torchaudio parameters. 42 If a dictionary, should contain "freq_low", "freq_high", "n_fft", "n_mels", "n_mfcc", 43 "win_length" and "hop_length" keys. 44 torchaudio_squim (bool): Parameters for torchaudio_squim feature extraction. 45 If False, torchaudio_squim features will not be extracted. 46 47 Returns: 48 List[Dict[str, Any]]: The list of feature dictionaries for each audio. 
49 50 Examples: 51 >>> extract_features_from_audios(audios) 52 [{'opensmile': {'F0semitoneFrom27.5Hz_sma3nz_amean': 25.710796356201172, 53 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': 0.1605353206396103, 54 'F0semitoneFrom27.5Hz_sma3nz_percentile20.0': 21.095951080322266, 55 'F0semitoneFrom27.5Hz_sma3nz_percentile50.0': 25.9762020111084, 56 'F0semitoneFrom27.5Hz_sma3nz_percentile80.0': 29.512413024902344, 57 'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2': 8.416461944580078, 58 'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope': 82.34796905517578, 59 'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope': 99.20043182373047, 60 'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope': 22.002275466918945, 61 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope': 9.043970108032227, 62 'loudness_sma3_amean': 0.86087566614151, 63 'loudness_sma3_stddevNorm': 0.43875235319137573, 64 'loudness_sma3_percentile20.0': 0.5877408981323242, 65 'loudness_sma3_percentile50.0': 0.8352401852607727, 66 'loudness_sma3_percentile80.0': 1.1747918128967285, 67 'loudness_sma3_pctlrange0-2': 0.5870509147644043, 68 'loudness_sma3_meanRisingSlope': 10.285204887390137, 69 'loudness_sma3_stddevRisingSlope': 7.544795513153076, 70 'loudness_sma3_meanFallingSlope': 7.612527370452881, 71 'loudness_sma3_stddevFallingSlope': 4.15903902053833, 72 'spectralFlux_sma3_amean': 0.3213598430156708, 73 'spectralFlux_sma3_stddevNorm': 0.6921582818031311, 74 'mfcc1_sma3_amean': 10.274803161621094, 75 'mfcc1_sma3_stddevNorm': 1.1581648588180542, 76 'mfcc2_sma3_amean': 4.262020111083984, 77 'mfcc2_sma3_stddevNorm': 2.052302837371826, 78 'mfcc3_sma3_amean': 7.624598026275635, 79 'mfcc3_sma3_stddevNorm': 1.4570358991622925, 80 'mfcc4_sma3_amean': 3.6676177978515625, 81 'mfcc4_sma3_stddevNorm': 2.6902272701263428, 82 'jitterLocal_sma3nz_amean': 0.019597552716732025, 83 'jitterLocal_sma3nz_stddevNorm': 0.9063860177993774, 84 'shimmerLocaldB_sma3nz_amean': 1.264746069908142, 85 'shimmerLocaldB_sma3nz_stddevNorm': 0.4629262685775757, 86 
'HNRdBACF_sma3nz_amean': 3.6400067806243896, 87 'HNRdBACF_sma3nz_stddevNorm': 0.5911334753036499, 88 'logRelF0-H1-H2_sma3nz_amean': 1.215877652168274, 89 'logRelF0-H1-H2_sma3nz_stddevNorm': 3.883843183517456, 90 'logRelF0-H1-A3_sma3nz_amean': 18.830764770507812, 91 'logRelF0-H1-A3_sma3nz_stddevNorm': 0.30870768427848816, 92 'F1frequency_sma3nz_amean': 665.1713256835938, 93 'F1frequency_sma3nz_stddevNorm': 0.41958823800086975, 94 'F1bandwidth_sma3nz_amean': 1300.2757568359375, 95 'F1bandwidth_sma3nz_stddevNorm': 0.16334553062915802, 96 'F1amplitudeLogRelF0_sma3nz_amean': -132.1533660888672, 97 'F1amplitudeLogRelF0_sma3nz_stddevNorm': -0.6691396832466125, 98 'F2frequency_sma3nz_amean': 1657.013916015625, 99 'F2frequency_sma3nz_stddevNorm': 0.17019854485988617, 100 'F2bandwidth_sma3nz_amean': 1105.7457275390625, 101 'F2bandwidth_sma3nz_stddevNorm': 0.24520403146743774, 102 'F2amplitudeLogRelF0_sma3nz_amean': -132.76707458496094, 103 'F2amplitudeLogRelF0_sma3nz_stddevNorm': -0.6468541026115417, 104 'F3frequency_sma3nz_amean': 2601.6630859375, 105 'F3frequency_sma3nz_stddevNorm': 0.11457356810569763, 106 'F3bandwidth_sma3nz_amean': 1091.15087890625, 107 'F3bandwidth_sma3nz_stddevNorm': 0.3787318468093872, 108 'F3amplitudeLogRelF0_sma3nz_amean': -134.52105712890625, 109 'F3amplitudeLogRelF0_sma3nz_stddevNorm': -0.620308518409729, 110 'alphaRatioV_sma3nz_amean': -8.626543045043945, 111 'alphaRatioV_sma3nz_stddevNorm': -0.4953792095184326, 112 'hammarbergIndexV_sma3nz_amean': 16.796842575073242, 113 'hammarbergIndexV_sma3nz_stddevNorm': 0.3567312955856323, 114 'slopeV0-500_sma3nz_amean': 0.021949246525764465, 115 'slopeV0-500_sma3nz_stddevNorm': 1.0097224712371826, 116 'slopeV500-1500_sma3nz_amean': -0.008139753714203835, 117 'slopeV500-1500_sma3nz_stddevNorm': -1.6243411302566528, 118 'spectralFluxV_sma3nz_amean': 0.4831695556640625, 119 'spectralFluxV_sma3nz_stddevNorm': 0.48576226830482483, 120 'mfcc1V_sma3nz_amean': 20.25444793701172, 121 'mfcc1V_sma3nz_stddevNorm': 
0.44413772225379944, 122 'mfcc2V_sma3nz_amean': 3.619405746459961, 123 'mfcc2V_sma3nz_stddevNorm': 2.1765975952148438, 124 'mfcc3V_sma3nz_amean': 7.736487865447998, 125 'mfcc3V_sma3nz_stddevNorm': 1.8630998134613037, 126 'mfcc4V_sma3nz_amean': 4.60503625869751, 127 'mfcc4V_sma3nz_stddevNorm': 2.864668846130371, 128 'alphaRatioUV_sma3nz_amean': -2.5990121364593506, 129 'hammarbergIndexUV_sma3nz_amean': 8.862899780273438, 130 'slopeUV0-500_sma3nz_amean': 0.002166695659980178, 131 'slopeUV500-1500_sma3nz_amean': 0.006735736038535833, 132 'spectralFluxUV_sma3nz_amean': 0.24703539907932281, 133 'loudnessPeaksPerSec': 3.8834950923919678, 134 'VoicedSegmentsPerSec': 2.745098114013672, 135 'MeanVoicedSegmentLengthSec': 0.12214285880327225, 136 'StddevVoicedSegmentLengthSec': 0.09025190770626068, 137 'MeanUnvoicedSegmentLength': 0.20666664838790894, 138 'StddevUnvoicedSegmentLength': 0.17666037380695343, 139 'equivalentSoundLevel_dBp': -24.297256469726562}, 140 'torchaudio': {'pitch': tensor([484.8485, 484.8485, 470.5882, 372.0930, 340.4255, 320.0000, 296.2963, 141 140.3509, 135.5932, 126.9841, 124.0310, 124.0310, 113.4752, 110.3448, 142 110.3448, 108.8435, 105.9603, 108.8435, 110.3448, 113.4752, 113.4752, 143 124.0310, 113.4752, 113.4752, 108.8435, 105.9603, 105.9603, 105.9603, 144 106.6667, 105.9603, 105.9603, 104.5752, 104.5752, 104.5752, 104.5752, 145 101.2658, 101.2658, 100.6289, 100.6289, 100.0000, 100.0000, 98.1595, 146 98.1595, 98.1595, 95.8084, 95.8084, 95.8084, 95.2381, 95.2381, 147 94.6746, 91.9540, 91.9540, 91.9540, 91.9540, 91.9540, 91.4286, 148 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 90.9091, 149 90.9091, 90.9091, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 150 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 151 91.4286, 91.9540, 91.9540, 93.0233, 93.5673, 93.5673, 94.1176, 152 94.6746, 94.6746, 94.6746, 95.8084, 96.3855, 96.9697, 100.0000, 153 100.0000, 100.0000, 100.0000, 100.0000, 100.0000, 103.8961, 104.5752, 154 104.5752, 
106.6667, 106.6667, 106.6667, 111.1111, 116.7883, 116.7883, 155 116.7883, 118.5185, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 156 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 122.1374, 157 123.0769, 123.0769, 125.9843, 125.9843, 125.9843, 123.0769, 123.0769, 158 122.1374, 122.1374, 121.2121, 121.2121, 121.2121, 120.3008, 117.6471, 159 117.6471, 117.6471, 108.8435, 108.1081, 106.6667, 105.2632, 105.2632, 160 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 161 108.8435, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 162 105.2632, 119.4030, 119.4030, 120.3008, 120.3008, 121.2121, 121.2121, 163 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 122.1374, 164 122.1374, 122.1374, 122.1374, 123.0769, 123.0769, 123.0769, 123.0769, 165 123.0769, 123.0769, 120.3008, 120.3008, 120.3008, 119.4030, 113.4752, 166 106.6667, 103.2258, 103.2258, 96.9697, 96.9697, 96.9697, 96.9697, 167 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 168 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 169 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 97.5610, 97.5610, 170 97.5610, 97.5610, 97.5610, 98.1595, 100.0000, 100.6289, 100.6289, 171 100.6289, 100.6289, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 172 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 97.5610, 173 90.9091, 89.8876, 88.8889, 88.8889, 88.3978, 87.4317, 86.0215, 174 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 175 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 176 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 177 86.0215, 86.0215, 86.0215, 86.0215, 86.4865, 86.4865, 86.4865, 178 86.4865, 86.4865, 87.4317, 87.9121, 87.9121, 87.9121, 89.8876, 179 90.9091, 90.9091, 90.9091, 90.9091, 90.9091, 91.4286, 91.4286, 180 91.4286, 92.4855, 92.4855, 93.0233, 93.0233, 93.0233, 93.5673, 181 93.5673, 95.2381, 95.2381, 100.0000, 101.9108, 112.6761, 112.6761, 182 112.6761, 122.1374, 
122.1374, 122.1374, 130.0813, 126.9841, 126.9841, 183 130.0813, 130.0813, 130.0813, 130.0813, 137.9310, 130.0813, 130.0813, 184 130.0813, 126.9841, 125.9843, 126.9841, 125.9843, 125.9843, 125.9843, 185 125.9843, 125.9843, 126.9841, 126.9841, 130.0813, 130.0813, 126.9841, 186 130.0813, 130.0813, 132.2314, 130.0813, 130.0813, 132.2314, 134.4538, 187 134.4538, 135.5932, 135.5932, 137.9310, 135.5932, 135.5932, 135.5932, 188 135.5932, 137.9310, 137.9310, 140.3509, 141.5929, 141.5929, 141.5929, 189 144.1441, 144.1441, 149.5327, 149.5327, 149.5327, 141.5929, 141.5929, 190 141.5929, 149.5327, 149.5327, 153.8462, 160.0000, 160.0000, 160.0000, 191 160.0000, 160.0000, 163.2653, 164.9485, 164.9485, 164.9485, 164.9485, 192 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 193 164.9485, 164.9485, 164.9485, 164.9485, 156.8627, 155.3398, 155.3398, 194 155.3398, 153.8462, 153.8462, 152.3810, 152.3810, 149.5327, 148.1481, 195 148.1481, 148.1481, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 196 148.1481, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 197 146.7890, 146.7890, 145.4545, 145.4545, 152.3810, 153.8462, 153.8462, 198 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 199 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 200 153.8462, 153.8462, 153.8462, 153.8462, 152.3810, 152.3810, 152.3810, 201 152.3810, 149.5327, 148.1481, 148.1481, 148.1481, 148.1481, 148.1481, 202 148.1481, 146.7890, 148.1481, 148.1481, 145.4545, 145.4545, 145.4545, 203 145.4545, 145.4545, 144.1441, 144.1441, 144.1441, 142.8571, 142.8571, 204 142.8571, 142.8571, 142.8571, 142.8571, 142.8571, 144.1441, 144.1441, 205 145.4545, 145.4545, 145.4545, 145.4545, 146.7890, 146.7890, 146.7890, 206 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 207 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 145.4545, 208 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 209 146.7890, 146.7890, 
146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 210 400.0000, 400.0000, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485, 211 484.8485, 484.8485, 484.8485, 484.8485, 484.8485]), 212 'mel_filter_bank': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02, ..., 0.0000e+00, 0.0000e+00, 213 0.0000e+00], 214 [3.0977e-04, 1.5698e-02, 1.5785e-02, ..., 0.0000e+00, 0.0000e+00, 215 0.0000e+00], 216 [8.2318e-05, 1.4367e-02, 2.8095e-01, ..., 0.0000e+00, 0.0000e+00, 217 0.0000e+00], 218 ..., 219 [3.6322e-05, 9.7330e-03, 5.4812e-02, ..., 0.0000e+00, 0.0000e+00, 220 0.0000e+00], 221 [2.2802e-05, 1.2481e-02, 5.8374e-02, ..., 0.0000e+00, 0.0000e+00, 222 0.0000e+00], 223 [5.3029e-05, 3.1305e-02, 7.9842e-02, ..., 0.0000e+00, 0.0000e+00, 224 0.0000e+00]]), 225 'mfcc': tensor([[-6.2570e+02, -4.7505e+02, -3.1078e+02, ..., -6.3893e+02, 226 -6.3893e+02, -6.3893e+02], 227 [ 1.3593e+01, 1.9928e+01, 2.6022e+01, ..., 3.9824e-05, 228 3.9824e-05, 3.9824e-05], 229 [ 7.3933e+00, -2.1680e+01, -1.4259e+01, ..., -1.3440e-05, 230 -1.3440e-05, -1.3440e-05], 231 ..., 232 [ 1.8122e+00, -3.1072e+00, -3.7336e+00, ..., 7.0669e-05, 233 7.0669e-05, 7.0669e-05], 234 [-2.7518e-01, -9.4738e+00, -2.3157e+00, ..., -1.7963e-04, 235 -1.7963e-04, -1.7963e-04], 236 [ 2.3144e-01, -6.4129e+00, -8.4420e+00, ..., -1.5891e-04, 237 -1.5891e-04, -1.5891e-04]]), 238 'mel_spectrogram': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02, ..., 0.0000e+00, 0.0000e+00, 239 0.0000e+00], 240 [3.0977e-04, 1.5698e-02, 1.5785e-02, ..., 0.0000e+00, 0.0000e+00, 241 0.0000e+00], 242 [8.2318e-05, 1.4367e-02, 2.8095e-01, ..., 0.0000e+00, 0.0000e+00, 243 0.0000e+00], 244 ..., 245 [3.6322e-05, 9.7330e-03, 5.4812e-02, ..., 0.0000e+00, 0.0000e+00, 246 0.0000e+00], 247 [2.2802e-05, 1.2481e-02, 5.8374e-02, ..., 0.0000e+00, 0.0000e+00, 248 0.0000e+00], 249 [5.3029e-05, 3.1305e-02, 7.9842e-02, ..., 0.0000e+00, 0.0000e+00, 250 0.0000e+00]]), 251 'spectrogram': tensor([[3.5553e-06, 5.9962e-03, 2.7176e-02, ..., 0.0000e+00, 0.0000e+00, 252 0.0000e+00], 253 
[5.0707e-04, 1.1670e-02, 1.5016e-02, ..., 0.0000e+00, 0.0000e+00, 254 0.0000e+00], 255 [3.1901e-04, 1.8529e-02, 1.8078e-02, ..., 0.0000e+00, 0.0000e+00, 256 0.0000e+00], 257 ..., 258 [1.0302e-05, 3.5917e-03, 2.7169e-03, ..., 0.0000e+00, 0.0000e+00, 259 0.0000e+00], 260 [9.6637e-08, 1.3364e-03, 1.8495e-02, ..., 0.0000e+00, 0.0000e+00, 261 0.0000e+00], 262 [1.4414e-05, 1.0598e-04, 2.8004e-02, ..., 0.0000e+00, 0.0000e+00, 263 0.0000e+00]])}, 264 'parselmouth': ({'duration': 5.1613125, 265 'speaking_rate': 3.874983349680919, 266 'articulation_rate': 3.874983349680919, 267 'phonation_ratio': 1.0, 268 'pause_rate': 0.0, 269 'mean_pause_duration': 0.0, 270 'mean_f0_hertz': 118.59917806814313, 271 'std_f0_hertz': 30.232960797931817, 272 'mean_intensity_db': 69.76277128148347, 273 'std_intensity_db': 58.54414165935646, 274 'range_ratio_intensity_db': -0.25736445047981316, 275 'pitch_floor': 60.0, 276 'pitch_ceiling': 250.0, 277 'mean_hnr_db': 3.3285614070654375, 278 'std_hnr_db': 3.36490968797237, 279 'spectral_slope': -13.982306776816046, 280 'spectral_tilt': -0.004414961849917737, 281 'cepstral_peak_prominence_mean': 7.0388038514346825, 282 'cepstral_peak_prominence_std': 1.5672438573255245, 283 'mean_f1_loc': 613.4664268420964, 284 'std_f1_loc': 303.98235579059883, 285 'mean_b1_loc': 401.96960219300837, 286 'std_b1_loc': 400.9001719378358, 287 'mean_f2_loc': 1701.7755281579418, 288 'std_f2_loc': 325.4405394017738, 289 'mean_b2_loc': 434.542188503193, 290 'std_b2_loc': 380.8914612651878, 291 'spectral_gravity': 579.587511962247, 292 'spectral_std_dev': 651.3025011919739, 293 'spectral_skewness': 3.5879707548251045, 294 'spectral_kurtosis': 19.991495997865282, 295 'local_jitter': 0.02553484151620524, 296 'localabsolute_jitter': 0.00021392842618599855, 297 'rap_jitter': 0.012174051087556429, 298 'ppq5_jitter': 0.01597797849248675, 299 'ddp_jitter': 0.03652215326266929, 300 'local_shimmer': 0.1530474665829716, 301 'localDB_shimmer': 1.3511061323188314, 302 'apq3_shimmer': 
0.0702984931637734, 303 'apq5_shimmer': 0.09680154282272849, 304 'apq11_shimmer': 0.19065409516266155, 305 'dda_shimmer': 0.2108954794913202},), 306 'torchaudio_squim': {'stoi': 0.9247563481330872, 307 'pesq': 1.3702949285507202, 308 'si_sdr': 11.71167278289795}}] 309 """ 310 if opensmile: 311 default_opensmile = { 312 "feature_set": "eGeMAPSv02", 313 "feature_level": "Functionals", 314 "plugin": "serial", 315 "plugin_args": {}, 316 "cache_dir": None, 317 } 318 if isinstance(opensmile, dict): 319 my_opensmile = {**default_opensmile, **opensmile} 320 else: 321 my_opensmile = default_opensmile 322 opensmile_features = extract_opensmile_features_from_audios(audios, **my_opensmile) # type: ignore 323 if parselmouth: 324 default_parselmouth = { 325 "time_step": 0.005, 326 "window_length": 0.025, 327 "pitch_unit": "Hertz", 328 "cache_dir": None, 329 "speech_rate": True, 330 "intensity_descriptors": True, 331 "harmonicity_descriptors": True, 332 "formants": True, 333 "spectral_moments": True, 334 "pitch": True, 335 "slope_tilt": True, 336 "cpp_descriptors": True, 337 "duration": True, 338 "jitter": True, 339 "shimmer": True, 340 "plugin": "serial", 341 "plugin_args": {}, 342 } 343 # Update default_parselmouth with provided parselmouth dictionary 344 if isinstance(parselmouth, dict): 345 my_parselmouth = {**default_parselmouth, **parselmouth} 346 else: 347 my_parselmouth = default_parselmouth 348 349 parselmouth_features = extract_praat_parselmouth_features_from_audios(audios=audios, **my_parselmouth) # type: ignore 350 351 if torchaudio: 352 default_torchaudio: Dict[str, Any] = { 353 "freq_low": 80, 354 "freq_high": 500, 355 "n_fft": 1024, 356 "n_mels": 128, 357 "n_mfcc": 40, 358 "win_length": None, 359 "hop_length": None, 360 "plugin": "serial", 361 "plugin_args": {}, 362 "cache_dir": None, 363 } 364 if isinstance(torchaudio, dict): 365 my_torchaudio = {**default_torchaudio, **torchaudio} 366 else: 367 my_torchaudio = default_torchaudio 368 369 torchaudio_features = 
extract_torchaudio_features_from_audios(audios=audios, **my_torchaudio) # type: ignore 370 if torchaudio_squim: 371 torchaudio_squim_features = extract_objective_quality_features_from_audios(audios=audios) 372 373 results = [] 374 for i in range(len(audios)): 375 result = {} 376 if opensmile: 377 result["opensmile"] = opensmile_features[i] 378 if parselmouth: 379 result["praat_parselmouth"] = parselmouth_features[i] 380 if torchaudio: 381 result["torchaudio"] = torchaudio_features[i] 382 if torchaudio_squim: 383 result["torchaudio_squim"] = torchaudio_squim_features[i] 384 results.append(result) 385 386 return results
Extract features from a list of audio objects.
Arguments:
- audios (List[Audio]): The list of audio objects to extract features from.
- opensmile (Union[Dict[str, str], bool]): Parameters for OpenSMILE feature extraction. If False, OpenSMILE features will not be extracted. If True, uses default OpenSMILE parameters. If a dictionary, it may contain the "feature_set" and "feature_level" keys (as well as "plugin", "plugin_args" and "cache_dir"); any key that is omitted keeps its default value.
- parselmouth (Union[Dict[str, str], bool]): Parameters for Praat Parselmouth feature extraction. If False, Praat Parselmouth features will not be extracted. If True, uses default Praat Parselmouth parameters. If a dictionary, it may contain the "time_step", "window_length", "pitch_unit", "cache_dir", "speech_rate", "intensity_descriptors", "harmonicity_descriptors", "formants", "spectral_moments", "pitch", "slope_tilt", "cpp_descriptors", "duration", "jitter", "shimmer", "plugin" and "plugin_args" keys; any key that is omitted keeps its default value.
- torchaudio (Union[Dict[str, str], bool]): Parameters for torchaudio feature extraction. If False, torchaudio features will not be extracted. If True, uses default torchaudio parameters. If a dictionary, it may contain the "freq_low", "freq_high", "n_fft", "n_mels", "n_mfcc", "win_length", "hop_length", "plugin", "plugin_args" and "cache_dir" keys; any key that is omitted keeps its default value.
- torchaudio_squim (bool): Whether to extract torchaudio_squim (objective quality) features. If False, torchaudio_squim features will not be extracted. If True, they are extracted; no additional parameters are passed to this extractor.
Returns:
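List[Dict[str, Any]]: A list with one feature dictionary per entry in `audios`. Each dictionary stores the output of every enabled extractor under the keys "opensmile", "praat_parselmouth", "torchaudio" and "torchaudio_squim"; the key of a disabled extractor is absent.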
Examples:
>>> extract_features_from_audios(audios) [{'opensmile': {'F0semitoneFrom27.5Hz_sma3nz_amean': 25.710796356201172, 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': 0.1605353206396103, 'F0semitoneFrom27.5Hz_sma3nz_percentile20.0': 21.095951080322266, 'F0semitoneFrom27.5Hz_sma3nz_percentile50.0': 25.9762020111084, 'F0semitoneFrom27.5Hz_sma3nz_percentile80.0': 29.512413024902344, 'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2': 8.416461944580078, 'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope': 82.34796905517578, 'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope': 99.20043182373047, 'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope': 22.002275466918945, 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope': 9.043970108032227, 'loudness_sma3_amean': 0.86087566614151, 'loudness_sma3_stddevNorm': 0.43875235319137573, 'loudness_sma3_percentile20.0': 0.5877408981323242, 'loudness_sma3_percentile50.0': 0.8352401852607727, 'loudness_sma3_percentile80.0': 1.1747918128967285, 'loudness_sma3_pctlrange0-2': 0.5870509147644043, 'loudness_sma3_meanRisingSlope': 10.285204887390137, 'loudness_sma3_stddevRisingSlope': 7.544795513153076, 'loudness_sma3_meanFallingSlope': 7.612527370452881, 'loudness_sma3_stddevFallingSlope': 4.15903902053833, 'spectralFlux_sma3_amean': 0.3213598430156708, 'spectralFlux_sma3_stddevNorm': 0.6921582818031311, 'mfcc1_sma3_amean': 10.274803161621094, 'mfcc1_sma3_stddevNorm': 1.1581648588180542, 'mfcc2_sma3_amean': 4.262020111083984, 'mfcc2_sma3_stddevNorm': 2.052302837371826, 'mfcc3_sma3_amean': 7.624598026275635, 'mfcc3_sma3_stddevNorm': 1.4570358991622925, 'mfcc4_sma3_amean': 3.6676177978515625, 'mfcc4_sma3_stddevNorm': 2.6902272701263428, 'jitterLocal_sma3nz_amean': 0.019597552716732025, 'jitterLocal_sma3nz_stddevNorm': 0.9063860177993774, 'shimmerLocaldB_sma3nz_amean': 1.264746069908142, 'shimmerLocaldB_sma3nz_stddevNorm': 0.4629262685775757, 'HNRdBACF_sma3nz_amean': 3.6400067806243896, 'HNRdBACF_sma3nz_stddevNorm': 0.5911334753036499, 'logRelF0-H1-H2_sma3nz_amean': 
1.215877652168274, 'logRelF0-H1-H2_sma3nz_stddevNorm': 3.883843183517456, 'logRelF0-H1-A3_sma3nz_amean': 18.830764770507812, 'logRelF0-H1-A3_sma3nz_stddevNorm': 0.30870768427848816, 'F1frequency_sma3nz_amean': 665.1713256835938, 'F1frequency_sma3nz_stddevNorm': 0.41958823800086975, 'F1bandwidth_sma3nz_amean': 1300.2757568359375, 'F1bandwidth_sma3nz_stddevNorm': 0.16334553062915802, 'F1amplitudeLogRelF0_sma3nz_amean': -132.1533660888672, 'F1amplitudeLogRelF0_sma3nz_stddevNorm': -0.6691396832466125, 'F2frequency_sma3nz_amean': 1657.013916015625, 'F2frequency_sma3nz_stddevNorm': 0.17019854485988617, 'F2bandwidth_sma3nz_amean': 1105.7457275390625, 'F2bandwidth_sma3nz_stddevNorm': 0.24520403146743774, 'F2amplitudeLogRelF0_sma3nz_amean': -132.76707458496094, 'F2amplitudeLogRelF0_sma3nz_stddevNorm': -0.6468541026115417, 'F3frequency_sma3nz_amean': 2601.6630859375, 'F3frequency_sma3nz_stddevNorm': 0.11457356810569763, 'F3bandwidth_sma3nz_amean': 1091.15087890625, 'F3bandwidth_sma3nz_stddevNorm': 0.3787318468093872, 'F3amplitudeLogRelF0_sma3nz_amean': -134.52105712890625, 'F3amplitudeLogRelF0_sma3nz_stddevNorm': -0.620308518409729, 'alphaRatioV_sma3nz_amean': -8.626543045043945, 'alphaRatioV_sma3nz_stddevNorm': -0.4953792095184326, 'hammarbergIndexV_sma3nz_amean': 16.796842575073242, 'hammarbergIndexV_sma3nz_stddevNorm': 0.3567312955856323, 'slopeV0-500_sma3nz_amean': 0.021949246525764465, 'slopeV0-500_sma3nz_stddevNorm': 1.0097224712371826, 'slopeV500-1500_sma3nz_amean': -0.008139753714203835, 'slopeV500-1500_sma3nz_stddevNorm': -1.6243411302566528, 'spectralFluxV_sma3nz_amean': 0.4831695556640625, 'spectralFluxV_sma3nz_stddevNorm': 0.48576226830482483, 'mfcc1V_sma3nz_amean': 20.25444793701172, 'mfcc1V_sma3nz_stddevNorm': 0.44413772225379944, 'mfcc2V_sma3nz_amean': 3.619405746459961, 'mfcc2V_sma3nz_stddevNorm': 2.1765975952148438, 'mfcc3V_sma3nz_amean': 7.736487865447998, 'mfcc3V_sma3nz_stddevNorm': 1.8630998134613037, 'mfcc4V_sma3nz_amean': 4.60503625869751, 
'mfcc4V_sma3nz_stddevNorm': 2.864668846130371, 'alphaRatioUV_sma3nz_amean': -2.5990121364593506, 'hammarbergIndexUV_sma3nz_amean': 8.862899780273438, 'slopeUV0-500_sma3nz_amean': 0.002166695659980178, 'slopeUV500-1500_sma3nz_amean': 0.006735736038535833, 'spectralFluxUV_sma3nz_amean': 0.24703539907932281, 'loudnessPeaksPerSec': 3.8834950923919678, 'VoicedSegmentsPerSec': 2.745098114013672, 'MeanVoicedSegmentLengthSec': 0.12214285880327225, 'StddevVoicedSegmentLengthSec': 0.09025190770626068, 'MeanUnvoicedSegmentLength': 0.20666664838790894, 'StddevUnvoicedSegmentLength': 0.17666037380695343, 'equivalentSoundLevel_dBp': -24.297256469726562}, 'torchaudio': {'pitch': tensor([484.8485, 484.8485, 470.5882, 372.0930, 340.4255, 320.0000, 296.2963, 140.3509, 135.5932, 126.9841, 124.0310, 124.0310, 113.4752, 110.3448, 110.3448, 108.8435, 105.9603, 108.8435, 110.3448, 113.4752, 113.4752, 124.0310, 113.4752, 113.4752, 108.8435, 105.9603, 105.9603, 105.9603, 106.6667, 105.9603, 105.9603, 104.5752, 104.5752, 104.5752, 104.5752, 101.2658, 101.2658, 100.6289, 100.6289, 100.0000, 100.0000, 98.1595, 98.1595, 98.1595, 95.8084, 95.8084, 95.8084, 95.2381, 95.2381, 94.6746, 91.9540, 91.9540, 91.9540, 91.9540, 91.9540, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 90.9091, 90.9091, 90.9091, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.4286, 91.9540, 91.9540, 93.0233, 93.5673, 93.5673, 94.1176, 94.6746, 94.6746, 94.6746, 95.8084, 96.3855, 96.9697, 100.0000, 100.0000, 100.0000, 100.0000, 100.0000, 100.0000, 103.8961, 104.5752, 104.5752, 106.6667, 106.6667, 106.6667, 111.1111, 116.7883, 116.7883, 116.7883, 118.5185, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 122.1374, 123.0769, 123.0769, 125.9843, 125.9843, 125.9843, 123.0769, 123.0769, 122.1374, 122.1374, 121.2121, 121.2121, 121.2121, 120.3008, 117.6471, 117.6471, 117.6471, 108.8435, 
108.1081, 106.6667, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 108.8435, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 105.2632, 119.4030, 119.4030, 120.3008, 120.3008, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 121.2121, 122.1374, 122.1374, 122.1374, 122.1374, 123.0769, 123.0769, 123.0769, 123.0769, 123.0769, 123.0769, 120.3008, 120.3008, 120.3008, 119.4030, 113.4752, 106.6667, 103.2258, 103.2258, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 96.9697, 97.5610, 97.5610, 97.5610, 97.5610, 97.5610, 98.1595, 100.0000, 100.6289, 100.6289, 100.6289, 100.6289, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 101.2658, 97.5610, 90.9091, 89.8876, 88.8889, 88.8889, 88.3978, 87.4317, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.0215, 86.4865, 86.4865, 86.4865, 86.4865, 86.4865, 87.4317, 87.9121, 87.9121, 87.9121, 89.8876, 90.9091, 90.9091, 90.9091, 90.9091, 90.9091, 91.4286, 91.4286, 91.4286, 92.4855, 92.4855, 93.0233, 93.0233, 93.0233, 93.5673, 93.5673, 95.2381, 95.2381, 100.0000, 101.9108, 112.6761, 112.6761, 112.6761, 122.1374, 122.1374, 122.1374, 130.0813, 126.9841, 126.9841, 130.0813, 130.0813, 130.0813, 130.0813, 137.9310, 130.0813, 130.0813, 130.0813, 126.9841, 125.9843, 126.9841, 125.9843, 125.9843, 125.9843, 125.9843, 125.9843, 126.9841, 126.9841, 130.0813, 130.0813, 126.9841, 130.0813, 130.0813, 132.2314, 130.0813, 130.0813, 132.2314, 134.4538, 134.4538, 135.5932, 135.5932, 137.9310, 135.5932, 135.5932, 135.5932, 135.5932, 137.9310, 137.9310, 140.3509, 141.5929, 141.5929, 141.5929, 144.1441, 
144.1441, 149.5327, 149.5327, 149.5327, 141.5929, 141.5929, 141.5929, 149.5327, 149.5327, 153.8462, 160.0000, 160.0000, 160.0000, 160.0000, 160.0000, 163.2653, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 164.9485, 156.8627, 155.3398, 155.3398, 155.3398, 153.8462, 153.8462, 152.3810, 152.3810, 149.5327, 148.1481, 148.1481, 148.1481, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 148.1481, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 145.4545, 145.4545, 152.3810, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 153.8462, 152.3810, 152.3810, 152.3810, 152.3810, 149.5327, 148.1481, 148.1481, 148.1481, 148.1481, 148.1481, 148.1481, 146.7890, 148.1481, 148.1481, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 144.1441, 144.1441, 144.1441, 142.8571, 142.8571, 142.8571, 142.8571, 142.8571, 142.8571, 142.8571, 144.1441, 144.1441, 145.4545, 145.4545, 145.4545, 145.4545, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 145.4545, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 146.7890, 400.0000, 400.0000, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485, 484.8485]), 'mel_filter_bank': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [3.0977e-04, 1.5698e-02, 1.5785e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [8.2318e-05, 1.4367e-02, 2.8095e-01, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [3.6322e-05, 9.7330e-03, 5.4812e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [2.2802e-05, 1.2481e-02, 5.8374e-02, ..., 0.0000e+00, 0.0000e+00, 
0.0000e+00], [5.3029e-05, 3.1305e-02, 7.9842e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]]), 'mfcc': tensor([[-6.2570e+02, -4.7505e+02, -3.1078e+02, ..., -6.3893e+02, -6.3893e+02, -6.3893e+02], [ 1.3593e+01, 1.9928e+01, 2.6022e+01, ..., 3.9824e-05, 3.9824e-05, 3.9824e-05], [ 7.3933e+00, -2.1680e+01, -1.4259e+01, ..., -1.3440e-05, -1.3440e-05, -1.3440e-05], ..., [ 1.8122e+00, -3.1072e+00, -3.7336e+00, ..., 7.0669e-05, 7.0669e-05, 7.0669e-05], [-2.7518e-01, -9.4738e+00, -2.3157e+00, ..., -1.7963e-04, -1.7963e-04, -1.7963e-04], [ 2.3144e-01, -6.4129e+00, -8.4420e+00, ..., -1.5891e-04, -1.5891e-04, -1.5891e-04]]), 'mel_spectrogram': tensor([[4.4167e-04, 1.0165e-02, 1.3079e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [3.0977e-04, 1.5698e-02, 1.5785e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [8.2318e-05, 1.4367e-02, 2.8095e-01, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [3.6322e-05, 9.7330e-03, 5.4812e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [2.2802e-05, 1.2481e-02, 5.8374e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [5.3029e-05, 3.1305e-02, 7.9842e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]]), 'spectrogram': tensor([[3.5553e-06, 5.9962e-03, 2.7176e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [5.0707e-04, 1.1670e-02, 1.5016e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [3.1901e-04, 1.8529e-02, 1.8078e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [1.0302e-05, 3.5917e-03, 2.7169e-03, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [9.6637e-08, 1.3364e-03, 1.8495e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [1.4414e-05, 1.0598e-04, 2.8004e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]])}, 'parselmouth': ({'duration': 5.1613125, 'speaking_rate': 3.874983349680919, 'articulation_rate': 3.874983349680919, 'phonation_ratio': 1.0, 'pause_rate': 0.0, 'mean_pause_duration': 0.0, 'mean_f0_hertz': 118.59917806814313, 'std_f0_hertz': 30.232960797931817, 'mean_intensity_db': 69.76277128148347, 'std_intensity_db': 58.54414165935646, 'range_ratio_intensity_db': 
-0.25736445047981316, 'pitch_floor': 60.0, 'pitch_ceiling': 250.0, 'mean_hnr_db': 3.3285614070654375, 'std_hnr_db': 3.36490968797237, 'spectral_slope': -13.982306776816046, 'spectral_tilt': -0.004414961849917737, 'cepstral_peak_prominence_mean': 7.0388038514346825, 'cepstral_peak_prominence_std': 1.5672438573255245, 'mean_f1_loc': 613.4664268420964, 'std_f1_loc': 303.98235579059883, 'mean_b1_loc': 401.96960219300837, 'std_b1_loc': 400.9001719378358, 'mean_f2_loc': 1701.7755281579418, 'std_f2_loc': 325.4405394017738, 'mean_b2_loc': 434.542188503193, 'std_b2_loc': 380.8914612651878, 'spectral_gravity': 579.587511962247, 'spectral_std_dev': 651.3025011919739, 'spectral_skewness': 3.5879707548251045, 'spectral_kurtosis': 19.991495997865282, 'local_jitter': 0.02553484151620524, 'localabsolute_jitter': 0.00021392842618599855, 'rap_jitter': 0.012174051087556429, 'ppq5_jitter': 0.01597797849248675, 'ddp_jitter': 0.03652215326266929, 'local_shimmer': 0.1530474665829716, 'localDB_shimmer': 1.3511061323188314, 'apq3_shimmer': 0.0702984931637734, 'apq5_shimmer': 0.09680154282272849, 'apq11_shimmer': 0.19065409516266155, 'dda_shimmer': 0.2108954794913202},), 'torchaudio_squim': {'stoi': 0.9247563481330872, 'pesq': 1.3702949285507202, 'si_sdr': 11.71167278289795}}]