senselab.audio.tasks.features_extraction.praat_parselmouth

This module contains functions that extract features from audio files using the PRAAT library.

The initial implementation of this features extraction was started by Nicholas Cummins from King's College London and has since been further developed and maintained by the senselab community.

   1"""This module contains functions that extract features from audio files using the PRAAT library.
   2
   3The initial implementation of this features extraction was started by Nicholas Cummins
   4from King's College London and has since been further developed and maintained
   5by the senselab community.
   6"""
   7
   8import inspect
   9import os
  10from pathlib import Path
  11from typing import Any, Dict, List, Optional, Union
  12
  13import numpy as np
  14import pydra  # type: ignore
  15
  16from senselab.audio.data_structures import Audio
  17from senselab.utils.data_structures import logger
  18
try:
    import parselmouth  # type: ignore

    PARSELMOUTH_AVAILABLE = True
except ModuleNotFoundError:
    # parselmouth is an optional dependency (installed via `senselab[audio]`).
    # When it is missing we install a placeholder so that module-level type
    # annotations such as `parselmouth.Sound` still resolve at import time.
    # Every public function in this module re-checks PARSELMOUTH_AVAILABLE and
    # raises ModuleNotFoundError before touching the placeholder.
    PARSELMOUTH_AVAILABLE = False

    class DummyParselmouth:
        """Dummy class for when parselmouth is not available.

        This is helpful for type checking when parselmouth is not installed.
        """

        def __init__(self) -> None:
            """Dummy constructor for when parselmouth is not available."""
            pass

        def call(self, *args: object, **kwargs: object) -> None:  # type: ignore
            """Dummy method for when parselmouth is not available."""

        class Sound:
            """Dummy class for when parselmouth is not available."""

            def __init__(self, *args: object, **kwargs: object) -> None:
                """Dummy class for when parselmouth is not available."""
                pass

    # NOTE(review): the placeholder does not mirror the full parselmouth API
    # (e.g. it has no `praat` attribute), so it must never be called at
    # runtime — only referenced in annotations. The PARSELMOUTH_AVAILABLE
    # guards are what keep that invariant; confirm when adding new functions.
    parselmouth = DummyParselmouth()
  47
  48
  49def get_sound(audio: Union[Path, Audio], sampling_rate: int = 16000) -> parselmouth.Sound:
  50    """Get a sound object from a given audio file or Audio object.
  51
  52    Args:
  53        audio (Union[Path, Audio]): A path to an audio file or an Audio object.
  54        sampling_rate (int, optional): The sampling rate of the audio. Defaults to 16000.
  55
  56    Returns:
  57        parselmouth.Sound: A Parselmouth Sound object.
  58
  59    Raises:
  60        FileNotFoundError: If the file is not found at the given path.
  61    """
  62    if not PARSELMOUTH_AVAILABLE:
  63        raise ModuleNotFoundError(
  64            "`parselmouth` is not installed. "
  65            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
  66        )
  67
  68    try:
  69        # Loading the sound
  70        if isinstance(audio, Path):
  71            audio = audio.resolve()
  72            if not audio.exists():
  73                logger.error(f"File does not exist: {audio}")
  74                raise FileNotFoundError(f"File does not exist: {audio}")
  75            snd_full = parselmouth.Sound(str(audio))
  76        elif isinstance(audio, Audio):
  77            snd_full = parselmouth.Sound(audio.waveform, audio.sampling_rate)
  78
  79        # Preprocessing
  80        if parselmouth.praat.call(snd_full, "Get number of channels") > 1:
  81            snd_full = snd_full.convert_to_mono()
  82        if parselmouth.praat.call(snd_full, "Get sampling frequency") != sampling_rate:
  83            snd_full = parselmouth.praat.call(snd_full, "Resample", sampling_rate, 50)
  84            # Details of queery: https://www.fon.hum.uva.nl/praat/manual/Get_sampling_frequency.html
  85            # Details of conversion: https://www.fon.hum.uva.nl/praat/manual/Sound__Resample___.html
  86    except Exception as e:
  87        raise RuntimeError(f"Error loading sound: {e}")
  88    return snd_full
  89
  90
def extract_speech_rate(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
    """Extract speech timing and pausing features from a given sound object.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.

    Returns:
        Dict[str, float]: A dictionary containing the following features:

            - speaking_rate (float): Number of syllables divided by duration.
            - articulation_rate (float): Number of syllables divided by phonation time.
            - phonation_ratio (float): Phonation time divided by duration.
            - pause_rate (float): Number of pauses divided by duration.
            - mean_pause_dur (float): Total time pausing divided by the number of identified pauses.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_speech_rate(snd)
        {
            'speaking_rate': 5.3,
            'articulation_rate': 4.7,
            'phonation_ratio': 0.9,
            'pause_rate': 2.1,
            'mean_pause_dur': 0.5
        }
        ```

    Notes:
        On any internal error the function logs the exception and returns all
        features as ``np.nan`` rather than raising.

    Useful sources for this code:

        - https://sites.google.com/view/uhm-o-meter/scripts/syllablenuclei_v3?pli=1
        - https://drive.google.com/file/d/1o3mNdN5FKTiYQC9GHB1XoZ8JJIGZk_AK/view
        - (2009 paper) https://doi.org/10.3758/BRM.41.2.385
        - (2021 paper) https://doi.org/10.1080/0969594X.2021.1951162
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    try:
        # _____________________________________________________________________________________________________________
        # Load the sound object into parselmouth if it is an Audio object
        if not isinstance(snd, parselmouth.Sound):
            snd = get_sound(snd)

        # _____________________________________________________________________________________________________________
        # Key pause detection hyperparameters

        # Silence Threshold (dB) - standard setting to detect silence in the "To TextGrid (silences)" function.
        # The higher this number, the lower the chances of finding silent pauses
        silence_db = -25

        # Minimum_dip_between_peaks_(dB) - if there are decreases in intensity
        # of at least this value surrounding the peak, the peak is labelled to be a syllable nucleus
        # I.e. the size of the dip between two possible peaks
        # The higher this number, the fewer syllables will be found
        # For clean and filtered signal use 4, if not use 2 (recommended thresholds)
        min_dip = 4
        # Code for determining if the signal is not clean/filtered
        hnr = parselmouth.praat.call(
            snd.to_harmonicity_cc(), "Get mean", 0, 0
        )  # Note: (0,0) is the time range for extraction, setting both to zero tells praat to use the full file
        if hnr < 60:
            min_dip = 2

        # Minimum pause duration (s): How long should a pause be to be counted as a silent pause?
        # The higher this number, the fewer pauses will be found
        min_pause = 0.3  # the default for this is 0.1 in Praat, the de Jong's script has this set at 0.3
        # Based on values in: Toward an understanding of fluency:
        # A microanalysis of nonnative speaker conversations (Riggenbach)
        # – Micropause (silence of .2s or less)
        # – Hesitation (silence of .3 to .4s)
        # – Unfilled pause (silence of .5s or more)

        # ______________________________________________________________________________________________________________
        # Intensity information

        intensity = snd.to_intensity(minimum_pitch=50, time_step=0.016, subtract_mean=True)
        # These are the settings recommended by de Jong - "minimum pitch" set to 50 Hz.
        # With this parameter setting, we extract intensity smoothed over a time window of (3.2/minimum_pitch)=64 msec,
        #  with 16-msec time steps; explanation on these calculations is found at:
        # https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html

        min_intensity = parselmouth.praat.call(intensity, "Get minimum", 0, 0, "Parabolic")  # time range, Interpolation
        max_intensity = parselmouth.praat.call(intensity, "Get maximum", 0, 0, "Parabolic")  # time range, Interpolation

        # Silence is detected by measuring whether the intensity is 25 dB below the 99% highest peak
        # 99% is chosen to eliminate short loud bursts in intensity that may not have been speech

        # get .99 quantile to get maximum (without influence of non-speech sound bursts)
        max_99_intensity = parselmouth.praat.call(intensity, "Get quantile", 0, 0, 0.99)

        # estimate Intensity threshold
        # silence_db_1: absolute dB threshold below which a peak is ignored (clamped to the observed minimum)
        # silence_db_2: threshold relative to the true maximum, passed to "To TextGrid (silences)"
        silence_db_1 = max_99_intensity + silence_db
        db_adjustment = max_intensity - max_99_intensity
        silence_db_2 = silence_db - db_adjustment
        if silence_db_1 < min_intensity:
            silence_db_1 = min_intensity

        # ______________________________________________________________________________________________________________
        # Create a TextGrid in which the silent and sounding intervals, store these intervals

        textgrid = parselmouth.praat.call(
            intensity, "To TextGrid (silences)", silence_db_2, min_pause, 0.1, "silent", "sounding"
        )
        # Hyperparameters:
        # Silence threshold (dB),
        # Minimum silent interval (s) - minimum duration for an interval to be considered as silent
        # Minimum sounding interval (s) - minimum duration for an interval to be not considered as silent
        # Silent interval label
        # Sounding interval label

        # Loop through intervals and extract times of identified silent and sounding sections
        silencetier = parselmouth.praat.call(textgrid, "Extract tier", 1)
        silencetable = parselmouth.praat.call(silencetier, "Down to TableOfReal", "sounding")
        npauses = parselmouth.praat.call(silencetable, "Get number of rows")

        # Sum the duration of every sounding interval; also record the start of the
        # first and the end of the last sounding interval (speech boundaries).
        phonation_time = 0
        for ipause in range(npauses):
            pause = ipause + 1  # Praat tables are 1-indexed
            beginsound = parselmouth.praat.call(silencetable, "Get value", pause, 1)
            endsound = parselmouth.praat.call(silencetable, "Get value", pause, 2)
            speakingdur = endsound - beginsound

            phonation_time += speakingdur

            # This is to remove the first (before first word) and last (after last word) silence from consideration
            if pause == 1:
                begin_speak = beginsound
            if pause == (npauses):
                end_speak = endsound

        # ______________________________________________________________________________________________________________
        # Next block of code finds all possible peaks

        # Convert intensity contour into sound representation
        intensity_matrix = parselmouth.praat.call(intensity, "Down to Matrix")  # convert intensity to 2d representation

        # Convert intensity contour into sound representation
        sound_from_intensity_matrix = parselmouth.praat.call(intensity_matrix, "To Sound (slice)", 1)

        # find positive extrema, maxima in sound_from_intensity_matrix, which correspond to steepest rises in Intensity;
        point_process = parselmouth.praat.call(
            sound_from_intensity_matrix,
            "To PointProcess (extrema)",
            "Left",
            "yes",
            "no",
            "Sinc70",
        )

        # estimate peak positions (all peaks)
        t = []
        numpeaks = parselmouth.praat.call(point_process, "Get number of points")
        for i in range(numpeaks):
            t.append(parselmouth.praat.call(point_process, "Get time from index", i + 1))

        # ______________________________________________________________________________________________________________
        # Find the time and values of all peaks

        # fill array with intensity values; keep only peaks louder than the silence_db_1 threshold
        timepeaks = []
        peakcount = 0
        intensities = []
        for i in range(numpeaks):
            value = parselmouth.praat.call(sound_from_intensity_matrix, "Get value at time", t[i], "Cubic")
            if value > silence_db_1:
                peakcount += 1
                intensities.append(value)
                timepeaks.append(t[i])

        # ______________________________________________________________________________________________________________
        # Now find all valid peaks

        # fill array with valid peaks: only intensity values if preceding
        # dip in intensity is greater than min_dip
        validpeakcount = 0
        currenttime = timepeaks[0]
        currentint = intensities[0]
        validtime = []

        for p in range(peakcount - 1):
            following = p + 1
            followingtime = timepeaks[following]
            dip = parselmouth.praat.call(
                intensity, "Get minimum", currenttime, followingtime, "None"
            )  # Gets minimum value between two time points, doesn't interpolate/filter
            diffint = abs(currentint - dip)
            if diffint > min_dip:
                validpeakcount += 1
                validtime.append(timepeaks[p])
            # Update current time and intensity values for next loop
            currenttime = timepeaks[following]
            currentint = parselmouth.praat.call(intensity, "Get value at time", timepeaks[following], "Cubic")

        # ______________________________________________________________________________________________________________
        # Extract voicing information

        pitch = snd.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
        # Praat page for hyperparameters https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html
        # From de Jong's 2009 paper - We extract the pitch contour, this time using a window size of 100 msec
        # and 20-msec time steps, and exclude all peaks that are unvoiced
        # Key hyperparameters are different to praat recommended - can't find a reason for this
        # time_step: Optional[Positive[float]] = None,  - set per De Jong's recommendation
        # pitch_floor: Positive[float] = 75.0 set per de Jong recommendation - 3/30 gives 100ms
        # max_number_of_candidates: Positive[int] = 15 (can't find a reason for this value being lower)
        # very_accurate: bool = False,
        # silence_threshold: float = 0.03,
        # voicing_threshold: float = 0.45, (can't find a reason for this value being different)
        # octave_cost: float = 0.01,
        # octave_jump_cost: float = 0.35,
        # voiced_unvoiced_cost: float = 0.14, (can't find a reason for this value being different)
        # pitch_ceiling: Positive[float] = 600.0 (can't find a reason for this value being lower, might change to value
        # from pitch_value function)

        # ______________________________________________________________________________________________________________
        # Loop through valid peaks, count ones that are voiced (i.e., have valid pitch value at the same time)

        number_syllables = int(0)
        for time in range(validpeakcount):
            querytime = validtime[time]
            whichinterval = parselmouth.praat.call(textgrid, "Get interval at time", 1, querytime)
            whichlabel = parselmouth.praat.call(textgrid, "Get label of interval", 1, whichinterval)
            value = pitch.get_value_at_time(querytime)
            # A peak counts as a syllable only if it is voiced (pitch defined)
            # AND falls inside a "sounding" interval of the TextGrid.
            if not np.isnan(value):
                if whichlabel == "sounding":
                    number_syllables += 1

        # ______________________________________________________________________________________________________________
        # return results

        # Duration from the start of the first to the end of the last sounding
        # interval (leading/trailing silence excluded).
        original_dur = end_speak - begin_speak

        speaking_rate = number_syllables / original_dur
        articulation_rate = number_syllables / phonation_time
        phonation_ratio = phonation_time / original_dur

        # npauses counts sounding intervals; the gaps between them are the pauses.
        number_pauses = npauses - 1
        pause_time = original_dur - phonation_time

        pause_rate = number_pauses / original_dur
        mean_pause_dur = pause_time / number_pauses if number_pauses > 0 else 0.0

        return {
            "speaking_rate": speaking_rate,
            "articulation_rate": articulation_rate,
            "phonation_ratio": phonation_ratio,
            "pause_rate": pause_rate,
            "mean_pause_dur": mean_pause_dur,
        }

    except Exception as e:
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        # Best-effort contract: never raise, return NaNs on failure.
        return {
            "speaking_rate": np.nan,
            "articulation_rate": np.nan,
            "phonation_ratio": np.nan,
            "pause_rate": np.nan,
            "mean_pause_dur": np.nan,
        }
 356
 357
 358def extract_pitch_values(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
 359    """Estimate Pitch Range.
 360
 361    Calculates the mean pitch using a wide range and uses this to shorten the range for future pitch extraction
 362    algorithms.
 363
 364    Args:
 365        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 366
 367    Returns:
 368        dict: A dictionary containing the following keys:
 369
 370            - pitch_floor (float): The lowest pitch value to use in future pitch extraction algorithms.
 371            - pitch_ceiling (float): The highest pitch value to use in future pitch extraction algorithms.
 372
 373    Notes:
 374        Values are taken from: [Standardization of pitch-range settings in voice acoustic analysis](https://doi.org/10.3758/BRM.41.2.318)
 375
 376        The problem observed with doing a really broad pitch search was the occasional error if F1 was low.
 377        So crude outlier detection is used to help with this.
 378
 379        Important: These values are used within other functions, they are not outputs of the full code.
 380
 381        Different pitch extraction methods in Praat:
 382
 383        - Cross-correlation (Praat default) vs auto-correlation pitch extraction:
 384        both are used in different functions below.
 385        - Cross-correlation is better than auto-correlation at finding period-level variation,
 386        such as jitter and shimmer, whereas auto-correlation is better at finding intended intonation contours.
 387        - [Discussion on this on a Praat Forum](https://groups.io/g/Praat-Users-List/topic/pitch_detection_ac_vs_cc/78829266?p=,,,20,0,0,0::recentpostdate/sticky,,,20,2,20,78829266,previd=1612369050729515119,nextid=1605568402827788039&previd=1612369050729515119&nextid=1605568402827788039)
 388
 389    Examples:
 390        ```python
 391        >>> snd = parselmouth.Sound("path_to_audio.wav")
 392        >>> pitch_values(snd)
 393        {'pitch_floor': 60, 'pitch_ceiling': 250}
 394        ```
 395    """
 396    if not PARSELMOUTH_AVAILABLE:
 397        raise ModuleNotFoundError(
 398            "`parselmouth` is not installed. "
 399            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
 400        )
 401
 402    try:
 403        if not isinstance(snd, parselmouth.Sound):
 404            snd = get_sound(snd)
 405
 406        pitch_wide = snd.to_pitch_ac(time_step=0.005, pitch_floor=50, pitch_ceiling=600)
 407        # Other than values above, I'm using default hyperparamters
 408        # Details: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html
 409
 410        # remove outliers from wide pitch search
 411        pitch_values = pitch_wide.selected_array["frequency"]
 412        pitch_values = pitch_values[pitch_values != 0]
 413        pitch_values_Z = (pitch_values - np.mean(pitch_values)) / np.std(pitch_values)
 414        pitch_values_filtered = pitch_values[abs(pitch_values_Z) <= 2]
 415
 416        mean_pitch = np.mean(pitch_values_filtered)
 417
 418        # Here there is an interesting alternative solution to discuss: https://praatscripting.lingphon.net/conditionals-1.html
 419        if mean_pitch < 170:
 420            # 'male' settings
 421            pitch_floor = 60.0
 422            pitch_ceiling = 250.0
 423        else:
 424            # 'female' and 'child' settings
 425            pitch_floor = 100.0
 426            pitch_ceiling = 500.0
 427
 428        return {"pitch_floor": pitch_floor, "pitch_ceiling": pitch_ceiling}
 429    except Exception as e:
 430        current_frame = inspect.currentframe()
 431        if current_frame is not None:
 432            current_function_name = current_frame.f_code.co_name
 433            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 434        return {"pitch_floor": np.nan, "pitch_ceiling": np.nan}
 435
 436
 437def extract_pitch_descriptors(
 438    snd: Union[parselmouth.Sound, Path, Audio],
 439    floor: float,
 440    ceiling: float,
 441    frame_shift: float = 0.005,
 442    unit: str = "Hertz",
 443) -> Dict[str, float]:
 444    """Extract Pitch Features.
 445
 446    Function to extract key pitch features from a given sound object.
 447    This function uses the pitch_ac method as autocorrelation is better at finding intended intonation contours.
 448
 449    Args:
 450        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 451        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 452        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 453        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 454            Defaults to 0.005.
 455        unit (str, optional): The unit in which the pitch is returned. Defaults to "Hertz".
 456            Could be "semitones".
 457
 458    Returns:
 459        dict: A dictionary containing the following keys:
 460
 461            - mean_f0_{unit} (float): Mean pitch in {unit}.
 462            - stdev_f0_{unit} (float): Standard deviation in {unit}.
 463
 464    Notes:
 465        - Uses pitch_ac as autocorrelation is better at finding intended intonation contours.
 466        - stdev_f0_semitone is used in DOI: 10.1080/02699200400008353, which used this as a marker for dysphonia.
 467
 468    Examples:
 469        ```python
 470        >>> snd = parselmouth.Sound("path_to_audio.wav")
 471        >>> extract_pitch_descriptors(snd, 75, 500, 0.01, "Hertz")
 472        {'mean_f0_hertz': 220.5, 'stdev_f0_hertz': 2.5}
 473        ```
 474    """
 475    if not PARSELMOUTH_AVAILABLE:
 476        raise ModuleNotFoundError(
 477            "`parselmouth` is not installed. "
 478            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
 479        )
 480
 481    try:
 482        if not isinstance(snd, parselmouth.Sound):
 483            snd = get_sound(snd)
 484
 485        # Extract pitch object
 486        pitch = snd.to_pitch_ac(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)
 487        # Other than values above, I'm using default hyperparameters
 488        # Details: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html
 489
 490        # Extract mean, median, and standard deviation
 491        mean_f0 = parselmouth.praat.call(pitch, "Get mean", 0, 0, unit)  # time range, units
 492        stdev_f0 = parselmouth.praat.call(pitch, "Get standard deviation", 0, 0, unit)
 493
 494        # Return results
 495        return {f"mean_f0_{unit.lower()}": mean_f0, f"stdev_f0_{unit.lower()}": stdev_f0}
 496    except Exception as e:
 497        current_frame = inspect.currentframe()
 498        if current_frame is not None:
 499            current_function_name = current_frame.f_code.co_name
 500            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 501        return {f"mean_f0_{unit.lower()}": np.nan, f"stdev_f0_{unit.lower()}": np.nan}
 502
 503
 504def extract_intensity_descriptors(
 505    snd: Union[parselmouth.Sound, Path, Audio], floor: float, frame_shift: float
 506) -> Dict[str, float]:
 507    """Extract Intensity Features.
 508
 509    Function to extract key intensity information from a given sound object.
 510    This function is based on default Praat code adapted to work with Parselmouth.
 511
 512    Args:
 513        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 514        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 515        frame_shift (float): Time rate at which to extract a new intensity value, typically set to 5 ms.
 516
 517    Returns:
 518        dict: A dictionary containing the following keys:
 519
 520            - mean_db (float): Mean intensity in dB.
 521            - std_db (float): Standard deviation in dB.
 522            - range_db_ratio (float): Intensity range, expressed as a ratio in dB.
 523
 524    Examples:
 525        ```python
 526        >>> snd = parselmouth.Sound("path_to_audio.wav")
 527        >>> extract_intensity_descriptors(snd, 75, 0.01)
 528        {'mean_db': 70.5, 'std_db': 0.5, 'range_db_ratio': 2.5}
 529        ```
 530
 531    Notes:
 532        - Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html
 533        - For notes on extracting mean settings: https://www.fon.hum.uva.nl/praat/manual/Intro_6_2__Configuring_the_intensity_contour.html
 534    """
 535    if not PARSELMOUTH_AVAILABLE:
 536        raise ModuleNotFoundError(
 537            "`parselmouth` is not installed. "
 538            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
 539        )
 540
 541    try:
 542        if not isinstance(snd, parselmouth.Sound):
 543            snd = get_sound(snd)
 544
 545        # Extract intensity object
 546        intensity = snd.to_intensity(minimum_pitch=floor, time_step=frame_shift, subtract_mean=True)
 547        # Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html
 548
 549        # Extract descriptors
 550        mean_db = parselmouth.praat.call(
 551            intensity, "Get mean", 0, 0, "energy"
 552        )  # get mean - time range, time range, averaging method
 553        std_db = parselmouth.praat.call(intensity, "Get standard deviation", 0, 0)
 554        min_dB = parselmouth.praat.call(intensity, "Get minimum", 0, 0, "parabolic")  # time range, Interpolation
 555        max_dB = parselmouth.praat.call(intensity, "Get maximum", 0, 0, "parabolic")  # time range, Interpolation
 556        range_db_ratio = max_dB / min_dB
 557
 558        # Return results
 559        return {"mean_db": mean_db, "std_db": std_db, "range_db_ratio": range_db_ratio}
 560
 561    except Exception as e:
 562        current_frame = inspect.currentframe()
 563        if current_frame is not None:
 564            current_function_name = current_frame.f_code.co_name
 565            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 566        return {"mean_db": np.nan, "std_db": np.nan, "range_db_ratio": np.nan}
 567
 568
 569def extract_harmonicity_descriptors(
 570    snd: Union[parselmouth.Sound, Path, Audio], floor: float, frame_shift: float
 571) -> Dict[str, float]:
 572    """Voice Quality - HNR.
 573
 574    Function to calculate the Harmonic to Noise Ratio (HNR) in dB from a given sound object.
 575    This function uses the CC method as recommended by Praat.
 576
 577    Args:
 578        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 579        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 580        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 581
 582    Returns:
 583        dict: A dictionary containing the following key:
 584
 585            - hnr_db_mean (float): Mean Harmonic to Noise Ratio in dB.
 586            - hnr_db_std_dev (float): Harmonic to Noise Ratio standard deviation in dB.
 587
 588    Examples:
 589        ```python
 590        >>> snd = parselmouth.Sound("path_to_audio.wav")
 591        >>> extract_harmonicity_descriptors(snd, 75, 0.01)
 592        {'hnr_db_mean': 15.3, 'hnr_db_std_dev': 0.5}
 593        ```
 594
 595    Notes:
 596        - Praat recommends using the CC method: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__cc____.html
 597        - Default settings can be found at: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__ac____.html
 598    """
 599    if not PARSELMOUTH_AVAILABLE:
 600        raise ModuleNotFoundError(
 601            "`parselmouth` is not installed. "
 602            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
 603        )
 604
 605    try:
 606        if not isinstance(snd, parselmouth.Sound):
 607            snd = get_sound(snd)
 608
 609        # Extract HNR information
 610        harmonicity = snd.to_harmonicity_cc(
 611            time_step=frame_shift, minimum_pitch=floor, silence_threshold=0.1, periods_per_window=4.5
 612        )
 613        # Praat recommends using the CC method here: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__cc____.html
 614
 615        hnr_db_mean = parselmouth.praat.call(harmonicity, "Get mean", 0, 0)
 616        hnr_db_std_dev = parselmouth.praat.call(harmonicity, "Get standard deviation", 0, 0)
 617
 618        return {"hnr_db_mean": hnr_db_mean, "hnr_db_std_dev": hnr_db_std_dev}
 619    except Exception as e:
 620        current_frame = inspect.currentframe()
 621        if current_frame is not None:
 622            current_function_name = current_frame.f_code.co_name
 623            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 624
 625        return {"hnr_db_mean": np.nan, "hnr_db_std_dev": np.nan}
 626
 627
 628def extract_slope_tilt(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
 629    """Voice Quality - Spectral Slope/Tilt.
 630
 631    Function to extract spectral slope and tilt from a given sound object. This function is based on default
 632    Praat code adapted to work with Parselmouth.
 633
 634    Args:
 635        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 636        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 637        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 638
 639    Returns:
 640        dict: A dictionary containing the following keys:
 641
 642            - spectral_slope (float): Mean spectral slope.
 643            - spectral_tilt (float): Mean spectral tilt.
 644
 645    Examples:
 646        ```python
 647        >>> snd = parselmouth.Sound("path_to_audio.wav")
 648        >>> extract_slope_tilt(snd, 75, 500)
 649        {'spectral_slope': -0.8, 'spectral_tilt': -2.5}
 650        ```
 651
 652    Notes:
 653        - Spectral Slope: Ratio of energy in a spectra between 10-1000Hz over 1000-4000Hz.
 654        - Spectral Tilt: Linear slope of energy distribution between 100-5000Hz.
 655        - Using pitch-corrected LTAS to remove the effect of F0 and harmonics on the slope calculation:
 656        https://www.fon.hum.uva.nl/paul/papers/BoersmaKovacic2006.pdf
 657    """
 658    if not PARSELMOUTH_AVAILABLE:
 659        raise ModuleNotFoundError(
 660            "`parselmouth` is not installed. "
 661            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
 662        )
 663
 664    try:
 665        if not isinstance(snd, parselmouth.Sound):
 666            snd = get_sound(snd)
 667
 668        ltas_rep = parselmouth.praat.call(
 669            snd, "To Ltas (pitch-corrected)...", floor, ceiling, 5000, 100, 0.0001, 0.02, 1.3
 670        )
 671        # Hyperparameters: Min Pitch (Hz), Max Pitch (Hz), Maximum Frequency (Hz), Bandwidth (Hz), Shortest Period (s),
 672        # Longest Period (s), Maximum period factor
 673
 674        spectral_slope = parselmouth.praat.call(ltas_rep, "Get slope", 50, 1000, 1000, 4000, "dB")
 675        # Hyperparameters: f1min, f1max, f2min, f2max, averagingUnits
 676
 677        spectral_tilt_Report = parselmouth.praat.call(ltas_rep, "Report spectral tilt", 100, 5000, "Linear", "Robust")
 678        # Hyperparameters: minimumFrequency, maximumFrequency, Frequency Scale (linear or logarithmic),
 679        # Fit method (least squares or robust)
 680
 681        srt_st = spectral_tilt_Report.index("Slope: ") + len("Slope: ")
 682        end_st = spectral_tilt_Report.index("d", srt_st)
 683        spectral_tilt = float(spectral_tilt_Report[srt_st:end_st])
 684
 685        # Return results
 686        return {"spectral_slope": spectral_slope, "spectral_tilt": spectral_tilt}
 687
 688    except Exception as e:
 689        current_frame = inspect.currentframe()
 690        if current_frame is not None:
 691            current_function_name = current_frame.f_code.co_name
 692            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 693        return {"spectral_slope": np.nan, "spectral_tilt": np.nan}
 694
 695
 696def extract_cpp_descriptors(
 697    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, frame_shift: float
 698) -> Dict[str, float]:
 699    """Extract Cepstral Peak Prominence (CPP).
 700
 701    Function to calculate the Cepstral Peak Prominence (CPP) from a given sound object.
 702    This function is adapted from default Praat code to work with Parselmouth.
 703
 704    Args:
 705        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 706        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 707        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 708        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 709
 710    Returns:
 711        dict: A dictionary containing the following key:
 712
 713            - mean_cpp (float): Mean Cepstral Peak Prominence.
 714            - std_dev_cpp (float): Standard deviation in Cepstral Peak Prominence.
 715
 716    Examples:
 717        ```python
 718        >>> snd = parselmouth.Sound("path_to_audio.wav")
 719        >>> extract_CPP(snd, 75, 500, 0.01)
 720        {'mean_cpp': 20.3, 'std_dev_cpp': 0.5}
 721        ```
 722
 723    Notes:
 724        - Cepstral Peak Prominence: The height (i.e., “prominence”) of that peak relative to a regression line
 725        through the overall cepstrum.
 726        - Adapted from: https://osf.io/ctwgr and http://phonetics.linguistics.ucla.edu/facilities/acoustic/voiced_extract_auto.txt
 727    """
 728    if not PARSELMOUTH_AVAILABLE:
 729        raise ModuleNotFoundError(
 730            "`parselmouth` is not installed. "
 731            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
 732        )
 733
 734    try:
 735        if not isinstance(snd, parselmouth.Sound):
 736            snd = get_sound(snd)
 737
 738        # Extract pitch object for voiced checking
 739        pitch = snd.to_pitch_ac(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling, voicing_threshold=0.3)
 740
 741        pulses = parselmouth.praat.call([snd, pitch], "To PointProcess (cc)")
 742
 743        textgrid = parselmouth.praat.call(pulses, "To TextGrid (vuv)", 0.02, 0.1)
 744
 745        vuv_table = parselmouth.praat.call(textgrid, "Down to Table", "no", 6, "yes", "no")
 746        # Variables - include line number, Time decimals, include tier names, include empty intervals
 747
 748        cpp_list = []
 749
 750        n_intervals = parselmouth.praat.call(vuv_table, "Get number of rows")
 751        for i in range(n_intervals):
 752            label = parselmouth.praat.call(vuv_table, "Get value", i + 1, "text")
 753            if label == "V":
 754                tmin = parselmouth.praat.call(vuv_table, "Get value", i + 1, "tmin")
 755                tmax = parselmouth.praat.call(vuv_table, "Get value", i + 1, "tmax")
 756                snd_segment = snd.extract_part(float(tmin), float(tmax))
 757
 758                PowerCepstrogram = parselmouth.praat.call(snd_segment, "To PowerCepstrogram", 60, 0.002, 5000, 50)
 759                # PowerCepstrogram (60-Hz pitch floor, 2-ms time step, 5-kHz maximum frequency,
 760                # and pre-emphasis from 50 Hz)
 761
 762                try:
 763                    CPP_Value = parselmouth.praat.call(
 764                        PowerCepstrogram,
 765                        "Get CPPS...",
 766                        "no",
 767                        0.01,
 768                        0.001,
 769                        60,
 770                        330,
 771                        0.05,
 772                        "parabolic",
 773                        0.001,
 774                        0,
 775                        "Straight",
 776                        "Robust",
 777                    )
 778                    # Subtract tilt before smoothing = “no”; time averaging window = 0.01 s;
 779                    # quefrency averaging window = 0.001 s;
 780                    # Peak search pitch range = 60–330 Hz; tolerance = 0.05; interpolation = “Parabolic”;
 781                    # tilt line frequency range = 0.001–0 s (no upper bound);
 782                    # Line type = “Straight”; fit method = “Robust.”
 783                except Exception as e:
 784                    current_frame = inspect.currentframe()
 785                    if current_frame is not None:
 786                        current_function_name = current_frame.f_code.co_name
 787                        logger.error(f'Error in "{current_function_name}": \n' + str(e))
 788                    CPP_Value = np.nan
 789
 790                if not np.isnan(CPP_Value) and CPP_Value > 4:
 791                    cpp_list.append(CPP_Value)
 792
 793        # Calculate Final Features
 794        if cpp_list:
 795            CPP_array = np.array(cpp_list)
 796            CPP_mean = np.mean(CPP_array)
 797            CPP_std = np.std(CPP_array)
 798        else:
 799            CPP_mean = np.nan
 800            CPP_std = np.nan
 801
 802        # Return Result
 803        return {"mean_cpp": CPP_mean, "std_dev_cpp": CPP_std}
 804
 805    except Exception as e:
 806        current_frame = inspect.currentframe()
 807        if current_frame is not None:
 808            current_function_name = current_frame.f_code.co_name
 809            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 810        return {"mean_cpp": np.nan, "std_dev_cpp": np.nan}
 811
 812
 813def measure_f1f2_formants_bandwidths(
 814    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, frame_shift: float
 815) -> Dict[str, float]:
 816    """Extract Formant Frequency Features.
 817
 818    Function to extract formant frequency features from a given sound object. This function is adapted from default
 819    Praat code to work with Parselmouth.
 820
 821    Args:
 822        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 823        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 824        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 825        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 826
 827    Returns:
 828        dict: A dictionary containing the following keys:
 829
 830            - f1_mean (float): Mean F1 location.
 831            - f1_std (float): Standard deviation of F1 location.
 832            - b1_mean (float): Mean F1 bandwidth.
 833            - b1_std (float): Standard deviation of F1 bandwidth.
 834            - f2_mean (float): Mean F2 location.
 835            - f2_std (float): Standard deviation of F2 location.
 836            - b2_mean (float): Mean F2 bandwidth.
 837            - b2_std (float): Standard deviation of F2 bandwidth.
 838
 839    Examples:
 840        ```python
 841        >>> snd = parselmouth.Sound("path_to_audio.wav")
 842        >>> measureFormants(snd, 75, 500, 0.01)
 843        {'f1_mean': 500.0, 'f1_std': 50.0, 'b1_mean': 80.0, 'b1_std': 10.0, 'f2_mean': 1500.0,
 844        'f2_std': 100.0, 'b2_mean': 120.0, 'b2_std': 20.0}
 845        ```
 846
 847    Notes:
 848        - Formants are the resonances of the vocal tract, determined by tongue placement and vocal tract shape.
 849        - Mean F1 typically varies between 300 to 750 Hz, while mean F2 typically varies between 900 to 2300 Hz.
 850        - Formant bandwidth is measured by taking the width of the band forming 3 dB down from the formant peak.
 851        - Formant extraction occurs per pitch period (pulses), meaning that the analysis identifies the points in the
 852          sound where the vocal folds come together, helping to align the formant measurements precisely with the
 853          pitch periods.
 854        - Adapted from code at this [link](https://osf.io/6dwr3/).
 855    """
 856    if not PARSELMOUTH_AVAILABLE:
 857        raise ModuleNotFoundError(
 858            "`parselmouth` is not installed. "
 859            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
 860        )
 861
 862    try:
 863        if not isinstance(snd, parselmouth.Sound):
 864            snd = get_sound(snd)
 865
 866        # Extract formants
 867        formants = parselmouth.praat.call(snd, "To Formant (burg)", frame_shift, 5, 5000, 0.025, 50)
 868        # Key Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Formant__burg____.html
 869
 870        # Extract pitch using CC method
 871        pitch = snd.to_pitch_cc(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)
 872        pulses = parselmouth.praat.call([snd, pitch], "To PointProcess (cc)")
 873
 874        F1_list, F2_list, B1_list, B2_list = [], [], [], []
 875        numPoints = parselmouth.praat.call(pulses, "Get number of points")
 876
 877        for point in range(1, numPoints + 1):
 878            t = parselmouth.praat.call(pulses, "Get time from index", point)
 879
 880            F1_value = parselmouth.praat.call(formants, "Get value at time", 1, t, "Hertz", "Linear")
 881            if not np.isnan(F1_value):
 882                F1_list.append(F1_value)
 883
 884            B1_value = parselmouth.praat.call(formants, "Get bandwidth at time", 1, t, "Hertz", "Linear")
 885            if not np.isnan(B1_value):
 886                B1_list.append(B1_value)
 887
 888            F2_value = parselmouth.praat.call(formants, "Get value at time", 2, t, "Hertz", "Linear")
 889            if not np.isnan(F2_value):
 890                F2_list.append(F2_value)
 891
 892            B2_value = parselmouth.praat.call(formants, "Get bandwidth at time", 2, t, "Hertz", "Linear")
 893            if not np.isnan(B2_value):
 894                B2_list.append(B2_value)
 895
 896        f1_mean, f1_std = (np.mean(F1_list), np.std(F1_list)) if F1_list else (np.nan, np.nan)
 897        b1_mean, b1_std = (np.mean(B1_list), np.std(B1_list)) if B1_list else (np.nan, np.nan)
 898        f2_mean, f2_std = (np.mean(F2_list), np.std(F2_list)) if F2_list else (np.nan, np.nan)
 899        b2_mean, b2_std = (np.mean(B2_list), np.std(B2_list)) if B2_list else (np.nan, np.nan)
 900
 901        return {
 902            "f1_mean": f1_mean,
 903            "f1_std": f1_std,
 904            "b1_mean": b1_mean,
 905            "b1_std": b1_std,
 906            "f2_mean": f2_mean,
 907            "f2_std": f2_std,
 908            "b2_mean": b2_mean,
 909            "b2_std": b2_std,
 910        }
 911
 912    except Exception as e:
 913        current_frame = inspect.currentframe()
 914        if current_frame is not None:
 915            current_function_name = current_frame.f_code.co_name
 916            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 917        return {
 918            "f1_mean": np.nan,
 919            "f1_std": np.nan,
 920            "b1_mean": np.nan,
 921            "b1_std": np.nan,
 922            "f2_mean": np.nan,
 923            "f2_std": np.nan,
 924            "b2_mean": np.nan,
 925            "b2_std": np.nan,
 926        }
 927
 928
 929def extract_spectral_moments(
 930    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, window_size: float, frame_shift: float
 931) -> Dict[str, float]:
 932    """Extract Spectral Moments.
 933
 934    Function to extract spectral moments from a given sound object. This function is adapted from default
 935    Praat code to work with Parselmouth.
 936
 937    Args:
 938        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 939        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 940        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 941        window_size (float): Time frame over which the spectra is calculated, typically set to 25 ms.
 942        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 943
 944    Returns:
 945        dict: A dictionary containing the following keys:
 946
 947            - spectral_gravity (float): Mean spectral gravity.
 948            - spectral_std_dev (float): Mean spectral standard deviation.
 949            - spectral_skewness (float): Mean spectral skewness.
 950            - spectral_kurtosis (float): Mean spectral kurtosis.
 951
 952    Examples:
 953        ```python
 954        >>> snd = parselmouth.Sound("path_to_audio.wav")
 955        >>> extract_spectral_moments(snd, 75, 500, 0.025, 0.01)
 956        {'spectral_gravity': 5000.0, 'spectral_std_dev': 150.0, 'spectral_skewness': -0.5, 'spectral_kurtosis': 3.0}
 957        ```
 958
 959    Notes:
 960        - Spectral Gravity: Measure for how high the frequencies in a spectrum are on average over the entire frequency
 961        domain weighted by the power spectrum.
 962        - Spectral Standard Deviation: Measure for how much the frequencies in a spectrum can deviate from the centre
 963        of gravity.
 964        - Spectral Skewness: Measure for how much the shape of the spectrum below the centre of gravity is different
 965        from the shape above the mean frequency.
 966        - Spectral Kurtosis: Measure for how much the shape of the spectrum around the centre of gravity is different
 967          from a Gaussian shape.
 968        - Details: https://www.fon.hum.uva.nl/praat/manual/Spectrum__Get_central_moment___.html
 969    """
 970    if not PARSELMOUTH_AVAILABLE:
 971        raise ModuleNotFoundError(
 972            "`parselmouth` is not installed. "
 973            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
 974        )
 975
 976    try:
 977        if not isinstance(snd, parselmouth.Sound):
 978            snd = get_sound(snd)
 979
 980        # Extract pitch object for voiced checking
 981        pitch = snd.to_pitch_ac(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)
 982
 983        # Calculate Spectrogram
 984        spectrogram = snd.to_spectrogram(window_length=window_size, time_step=frame_shift)
 985        # Using default settings other than window length and frame shift
 986        # Details: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Spectrogram___.html
 987
 988        Gravity_list, STD_list, Skew_list, Kurt_list = [], [], [], []
 989
 990        num_steps = parselmouth.praat.call(spectrogram, "Get number of frames")
 991        for i in range(1, num_steps + 1):
 992            t = parselmouth.praat.call(spectrogram, "Get time from frame number", i)
 993            pitch_value = pitch.get_value_at_time(t)
 994
 995            if not np.isnan(pitch_value):
 996                voiced_spectrum = spectrogram.to_spectrum_slice(t)
 997                # Details: https://www.fon.hum.uva.nl/praat/manual/Spectrogram__To_Spectrum__slice____.html
 998
 999                Gravity_LLD = voiced_spectrum.get_centre_of_gravity(power=2)
1000                if not np.isnan(Gravity_LLD):
1001                    Gravity_list.append(Gravity_LLD)
1002
1003                STD_LLD = voiced_spectrum.get_standard_deviation(power=2)
1004                if not np.isnan(STD_LLD):
1005                    STD_list.append(STD_LLD)
1006
1007                Skew_LLD = voiced_spectrum.get_skewness(power=2)
1008                if not np.isnan(Skew_LLD):
1009                    Skew_list.append(Skew_LLD)
1010
1011                Kurt_LLD = voiced_spectrum.get_kurtosis(power=2)
1012                if not np.isnan(Kurt_LLD):
1013                    Kurt_list.append(Kurt_LLD)
1014
1015        gravity_mean = np.mean(Gravity_list) if Gravity_list else np.nan
1016        std_mean = np.mean(STD_list) if STD_list else np.nan
1017        skew_mean = np.mean(Skew_list) if Skew_list else np.nan
1018        kurt_mean = np.mean(Kurt_list) if Kurt_list else np.nan
1019
1020        return {
1021            "spectral_gravity": gravity_mean,
1022            "spectral_std_dev": std_mean,
1023            "spectral_skewness": skew_mean,
1024            "spectral_kurtosis": kurt_mean,
1025        }
1026
1027    except Exception as e:
1028        current_frame = inspect.currentframe()
1029        if current_frame is not None:
1030            current_function_name = current_frame.f_code.co_name
1031            logger.error(f'Error in "{current_function_name}": \n' + str(e))
1032        return {
1033            "spectral_gravity": np.nan,
1034            "spectral_std_dev": np.nan,
1035            "spectral_skewness": np.nan,
1036            "spectral_kurtosis": np.nan,
1037        }
1038
1039
### Additional per-audio descriptor functions (duration, jitter, shimmer) ###
1041
1042
def extract_audio_duration(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
    """Get the duration of a given audio file or Audio object.

    Builds a Parselmouth `Sound` from the input (when it is not one already) and
    queries Praat for the total duration of the audio in seconds.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object,
        a file path (Path), or an `Audio` object containing the audio waveform and
        its corresponding sampling rate.

    Returns:
        Dict[str, float]: A dictionary containing:
            - "duration" (float): The total duration of the audio in seconds.

    Raises:
        FileNotFoundError: If a provided file path does not exist.

    Example:
        ```python
        >>> snd = Audio(waveform=[...], sampling_rate=16000)
        >>> extract_audio_duration(snd)
        {'duration': 5.23}
        ```
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    # Normalize Path/Audio inputs into a Parselmouth Sound before measuring.
    if not isinstance(snd, parselmouth.Sound):
        snd = get_sound(snd)

    try:
        # Query Praat for the total duration and wrap it in the output dictionary.
        return {"duration": parselmouth.praat.call(snd, "Get total duration")}
    except Exception as e:
        frame = inspect.currentframe()
        if frame is not None:
            logger.error(f'Error in "{frame.f_code.co_name}": \n' + str(e))
        return {"duration": np.nan}
1091
1092
def extract_jitter(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
    """Returns the jitter descriptors for the given sound or audio file.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path),
        or an `Audio` object containing the audio waveform and its corresponding sampling rate.
        floor (float): Minimum fundamental frequency (F0) in Hz.
        ceiling (float): Maximum fundamental frequency (F0) in Hz.

    Returns:
        Dict[str, float]: A dictionary containing various jitter measurements
        (local, local absolute, rap, ppq5, ddp), or NaNs if extraction fails.
    """

    def _to_point_process(sound: parselmouth.Sound, f0min: float, f0max: float) -> parselmouth.Data:
        # Glottal pulse sequence used as the basis for all jitter measures.
        return parselmouth.praat.call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

    # Parameter renamed from `type` to avoid shadowing the builtin.
    def _extract_jitter(jitter_type: str, point_process: parselmouth.Data) -> float:
        return parselmouth.praat.call(point_process, f"Get jitter ({jitter_type})", 0, 0, 0.0001, 0.02, 1.3)

    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    # Check if the input is a Path or Audio, and convert to Parselmouth Sound if necessary
    if not isinstance(snd, parselmouth.Sound):
        snd = get_sound(snd)

    try:
        # Convert the sound to a point process for jitter measurement
        point_process = _to_point_process(snd, floor, ceiling)

        # Extract jitter measures from the point process
        return {
            "local_jitter": _extract_jitter("local", point_process),
            "localabsolute_jitter": _extract_jitter("local, absolute", point_process),
            "rap_jitter": _extract_jitter("rap", point_process),
            "ppq5_jitter": _extract_jitter("ppq5", point_process),
            "ddp_jitter": _extract_jitter("ddp", point_process),
        }

    except Exception as e:
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {
            "local_jitter": np.nan,
            "localabsolute_jitter": np.nan,
            "rap_jitter": np.nan,
            "ppq5_jitter": np.nan,
            "ddp_jitter": np.nan,
        }
1147
1148
def extract_shimmer(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
    """Returns the shimmer descriptors for the given sound or audio file.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path),
        or an `Audio` object containing the audio waveform and its corresponding sampling rate.
        floor (float): Minimum fundamental frequency (F0) in Hz.
        ceiling (float): Maximum fundamental frequency (F0) in Hz.

    Returns:
        Dict[str, float]: A dictionary containing various shimmer measurements
        (local, local dB, apq3, apq5, apq11, dda), or NaNs if extraction fails.
    """

    def _to_point_process(sound: parselmouth.Sound, f0min: float, f0max: float) -> parselmouth.Data:
        # Glottal pulse sequence used as the basis for all shimmer measures.
        return parselmouth.praat.call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

    # Parameter renamed from `type` to avoid shadowing the builtin.
    def _extract_shimmer(shimmer_type: str, sound: parselmouth.Sound, point_process: parselmouth.Data) -> float:
        return parselmouth.praat.call(
            [sound, point_process], f"Get shimmer ({shimmer_type})", 0, 0, 0.0001, 0.02, 1.3, 1.6
        )

    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    # Check if the input is a Path or Audio, and convert to Parselmouth Sound if necessary
    if not isinstance(snd, parselmouth.Sound):
        snd = get_sound(snd)

    try:
        # Convert the sound to a point process for shimmer measurement
        point_process = _to_point_process(snd, floor, ceiling)

        # Extract shimmer measures from the sound and point process
        return {
            "local_shimmer": _extract_shimmer("local", snd, point_process),
            "localDB_shimmer": _extract_shimmer("local_dB", snd, point_process),
            "apq3_shimmer": _extract_shimmer("apq3", snd, point_process),
            "apq5_shimmer": _extract_shimmer("apq5", snd, point_process),
            "apq11_shimmer": _extract_shimmer("apq11", snd, point_process),
            "dda_shimmer": _extract_shimmer("dda", snd, point_process),
        }

    except Exception as e:
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {
            "local_shimmer": np.nan,
            "localDB_shimmer": np.nan,
            "apq3_shimmer": np.nan,
            "apq5_shimmer": np.nan,
            "apq11_shimmer": np.nan,
            "dda_shimmer": np.nan,
        }
1205
1206
1207### Wrapper ###
1208def extract_praat_parselmouth_features_from_audios(
1209    audios: List[Audio],
1210    time_step: float = 0.005,
1211    window_length: float = 0.025,
1212    pitch_unit: str = "Hertz",
1213    cache_dir: Optional[str | os.PathLike] = None,
1214    speech_rate: bool = True,
1215    intensity_descriptors: bool = True,
1216    harmonicity_descriptors: bool = True,
1217    formants: bool = True,
1218    spectral_moments: bool = True,
1219    pitch: bool = True,
1220    slope_tilt: bool = True,
1221    cpp_descriptors: bool = True,
1222    duration: bool = True,
1223    jitter: bool = True,
1224    shimmer: bool = True,
1225    plugin: str = "serial",
1226    plugin_args: Dict[str, Any] = {},
1227) -> List[Dict[str, Any]]:
1228    """Extract features from a list of Audio objects and return a JSON-like dictionary.
1229
1230    Args:
1231        audios (list): List of Audio objects to extract features from.
1232        pitch_unit (str): Unit for pitch measurements. Defaults to "Hertz".
1233        time_step (float): Time rate at which to extract features. Defaults to 0.005.
1234        window_length (float): Window length in seconds for spectral features. Defaults to 0.025.
1235        cache_dir (Optional[str]): Directory to use for caching by pydra. Defaults to None.
1236        speech_rate (bool): Whether to extract speech rate. Defaults to True.
1237        intensity_descriptors (bool): Whether to extract intensity descriptors. Defaults to True.
1238        harmonicity_descriptors (bool): Whether to extract harmonic descriptors. Defaults to True.
1239        formants (bool): Whether to extract formants. Defaults to True.
1240        spectral_moments (bool): Whether to extract spectral moments. Defaults to True.
1241        pitch (bool): Whether to extract pitch. Defaults to True.
1242        slope_tilt (bool): Whether to extract slope and tilt. Defaults to True.
1243        cpp_descriptors (bool): Whether to extract CPP descriptors. Defaults to True.
1244        duration (bool): Whether to extract duration. Defaults to True.
1245        jitter (bool): Whether to extract jitter. Defaults to True.
1246        shimmer (bool): Whether to extract shimmer. Defaults to True.
1247        plugin (str): Plugin to use for feature extraction. Defaults to "serial".
1248        plugin_args (Optional[Dict[str, Any]]): Arguments for the pydra plugin. Defaults to {}.
1249
1250    Returns:
1251        dict: A JSON-like dictionary with extracted features structured under "praat_parselmouth".
1252    """
1253    # Mark tasks with Pydra
1254    extract_pitch_values_pt = pydra.mark.task(extract_pitch_values)
1255
1256    def _extract_pitch_floor(pitch_values_out: dict) -> float:
1257        return pitch_values_out["pitch_floor"]
1258
1259    _extract_pitch_floor_pt = pydra.mark.task(_extract_pitch_floor)
1260
1261    def _extract_pitch_ceiling(pitch_values_out: dict) -> float:
1262        return pitch_values_out["pitch_ceiling"]
1263
1264    _extract_pitch_ceiling_pt = pydra.mark.task(_extract_pitch_ceiling)
1265    if speech_rate:
1266        extract_speech_rate_pt = pydra.mark.task(extract_speech_rate)
1267    if intensity_descriptors:
1268        extract_intensity_descriptors_pt = pydra.mark.task(extract_intensity_descriptors)
1269    if harmonicity_descriptors:
1270        extract_harmonicity_descriptors_pt = pydra.mark.task(extract_harmonicity_descriptors)
1271    if formants:
1272        measure_f1f2_formants_bandwidths_pt = pydra.mark.task(measure_f1f2_formants_bandwidths)
1273    if spectral_moments:
1274        extract_spectral_moments_pt = pydra.mark.task(extract_spectral_moments)
1275    if pitch:
1276        extract_pitch_descriptors_pt = pydra.mark.task(extract_pitch_descriptors)
1277    if slope_tilt:
1278        extract_slope_tilt_pt = pydra.mark.task(extract_slope_tilt)
1279    if cpp_descriptors:
1280        extract_cpp_descriptors_pt = pydra.mark.task(extract_cpp_descriptors)
1281    if duration:
1282        extract_audio_duration_pt = pydra.mark.task(extract_audio_duration)
1283    if jitter:
1284        extract_jitter_pt = pydra.mark.task(extract_jitter)
1285    if shimmer:
1286        extract_shimmer_pt = pydra.mark.task(extract_shimmer)
1287
1288    # Create the workflow
1289    wf = pydra.Workflow(name="wf", input_spec=["x"], cache_dir=cache_dir)
1290    wf.split("x", x=audios)
1291    wf.add(extract_pitch_values_pt(name="extract_pitch_values_pt", snd=wf.lzin.x))
1292    wf.add(
1293        _extract_pitch_floor_pt(name="_extract_pitch_floor_pt", pitch_values_out=wf.extract_pitch_values_pt.lzout.out)
1294    )
1295    wf.add(
1296        _extract_pitch_ceiling_pt(
1297            name="_extract_pitch_ceiling_pt", pitch_values_out=wf.extract_pitch_values_pt.lzout.out
1298        )
1299    )
1300    if speech_rate:
1301        wf.add(extract_speech_rate_pt(name="extract_speech_rate_pt", snd=wf.lzin.x))
1302    if pitch:
1303        wf.add(
1304            extract_pitch_descriptors_pt(
1305                name="extract_pitch_descriptors_pt",
1306                snd=wf.lzin.x,
1307                floor=wf._extract_pitch_floor_pt.lzout.out,
1308                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
1309                frame_shift=time_step,
1310                unit=pitch_unit,
1311            )
1312        )
1313    if intensity_descriptors:
1314        wf.add(
1315            extract_intensity_descriptors_pt(
1316                name="extract_intensity_descriptors_pt",
1317                snd=wf.lzin.x,
1318                floor=wf._extract_pitch_floor_pt.lzout.out,
1319                frame_shift=time_step,
1320            )
1321        )
1322    if harmonicity_descriptors:
1323        wf.add(
1324            extract_harmonicity_descriptors_pt(
1325                name="extract_harmonicity_descriptors_pt",
1326                snd=wf.lzin.x,
1327                floor=wf._extract_pitch_floor_pt.lzout.out,
1328                frame_shift=time_step,
1329            )
1330        )
1331    if formants:
1332        wf.add(
1333            measure_f1f2_formants_bandwidths_pt(
1334                name="measure_f1f2_formants_bandwidths_pt",
1335                snd=wf.lzin.x,
1336                floor=wf._extract_pitch_floor_pt.lzout.out,
1337                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
1338                frame_shift=time_step,
1339            )
1340        )
1341    if spectral_moments:
1342        wf.add(
1343            extract_spectral_moments_pt(
1344                name="extract_spectral_moments_pt",
1345                snd=wf.lzin.x,
1346                floor=wf._extract_pitch_floor_pt.lzout.out,
1347                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
1348                window_size=window_length,
1349                frame_shift=time_step,
1350            )
1351        )
1352    if slope_tilt:
1353        wf.add(
1354            extract_slope_tilt_pt(
1355                name="extract_slope_tilt_pt",
1356                snd=wf.lzin.x,
1357                floor=wf._extract_pitch_floor_pt.lzout.out,
1358                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
1359            )
1360        )
1361    if cpp_descriptors:
1362        wf.add(
1363            extract_cpp_descriptors_pt(
1364                name="extract_cpp_descriptors_pt",
1365                snd=wf.lzin.x,
1366                floor=wf._extract_pitch_floor_pt.lzout.out,
1367                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
1368                frame_shift=time_step,
1369            )
1370        )
1371    if duration:
1372        wf.add(extract_audio_duration_pt(name="extract_audio_duration_pt", snd=wf.lzin.x))
1373    if jitter:
1374        wf.add(
1375            extract_jitter_pt(
1376                name="extract_jitter_pt",
1377                snd=wf.lzin.x,
1378                floor=wf._extract_pitch_floor_pt.lzout.out,
1379                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
1380            )
1381        )
1382    if shimmer:
1383        wf.add(
1384            extract_shimmer_pt(
1385                name="extract_shimmer_pt",
1386                snd=wf.lzin.x,
1387                floor=wf._extract_pitch_floor_pt.lzout.out,
1388                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
1389            )
1390        )
1391
1392    # setting multiple workflow outputs
1393    output_connections = [("pitch_values_out", wf.extract_pitch_values_pt.lzout.out)]
1394    if speech_rate:
1395        output_connections.append(("speech_rate_out", wf.extract_speech_rate_pt.lzout.out))
1396    if pitch:
1397        output_connections.append(("pitch_out", wf.extract_pitch_descriptors_pt.lzout.out))
1398    if intensity_descriptors:
1399        output_connections.append(("intensity_out", wf.extract_intensity_descriptors_pt.lzout.out))
1400    if harmonicity_descriptors:
1401        output_connections.append(("harmonicity_out", wf.extract_harmonicity_descriptors_pt.lzout.out))
1402    if formants:
1403        output_connections.append(("formants_out", wf.measure_f1f2_formants_bandwidths_pt.lzout.out))
1404    if spectral_moments:
1405        output_connections.append(("spectral_moments_out", wf.extract_spectral_moments_pt.lzout.out))
1406    if slope_tilt:
1407        output_connections.append(("slope_tilt_out", wf.extract_slope_tilt_pt.lzout.out))
1408    if cpp_descriptors:
1409        output_connections.append(("cpp_out", wf.extract_cpp_descriptors_pt.lzout.out))
1410    if duration:
1411        output_connections.append(("audio_duration", wf.extract_audio_duration_pt.lzout.out))
1412    if jitter:
1413        output_connections.append(("jitter_out", wf.extract_jitter_pt.lzout.out))
1414    if shimmer:
1415        output_connections.append(("shimmer_out", wf.extract_shimmer_pt.lzout.out))
1416    wf.set_output(output_connections)
1417
1418    with pydra.Submitter(plugin=plugin, **plugin_args) as sub:
1419        sub(wf)
1420
1421    outputs = wf.result()
1422
1423    extracted_data = []
1424
1425    for output in outputs:
1426        feature_data = {}
1427        # Audio duration
1428        if duration:
1429            feature_data["duration"] = output.output.audio_duration["duration"]
1430        # Timing and Pausing
1431        if speech_rate:
1432            feature_data["speaking_rate"] = output.output.speech_rate_out["speaking_rate"]
1433            feature_data["articulation_rate"] = output.output.speech_rate_out["articulation_rate"]
1434            feature_data["phonation_ratio"] = output.output.speech_rate_out["phonation_ratio"]
1435            feature_data["pause_rate"] = output.output.speech_rate_out["pause_rate"]
1436            feature_data["mean_pause_duration"] = output.output.speech_rate_out["mean_pause_dur"]
1437        # Pitch and Intensity:
1438        if pitch:
1439            feature_data[f"mean_f0_{pitch_unit.lower()}"] = output.output.pitch_out[f"mean_f0_{pitch_unit.lower()}"]
1440            feature_data[f"std_f0_{pitch_unit.lower()}"] = output.output.pitch_out[f"stdev_f0_{pitch_unit.lower()}"]
1441            feature_data["mean_intensity_db"] = output.output.intensity_out["mean_db"]
1442            feature_data["std_intensity_db"] = output.output.intensity_out["std_db"]
1443            feature_data["range_ratio_intensity_db"] = output.output.intensity_out["range_db_ratio"]
1444            # feature_data["pitch_floor"] = output.output.pitch_values_out["pitch_floor"]
1445            # feature_data["pitch_ceiling"] = output.output.pitch_values_out["pitch_ceiling"]
1446        # Quality Features:
1447        if harmonicity_descriptors:
1448            feature_data["mean_hnr_db"] = output.output.harmonicity_out["hnr_db_mean"]
1449            feature_data["std_hnr_db"] = output.output.harmonicity_out["hnr_db_std_dev"]
1450            feature_data["spectral_slope"] = output.output.slope_tilt_out["spectral_slope"]
1451            feature_data["spectral_tilt"] = output.output.slope_tilt_out["spectral_tilt"]
1452            feature_data["cepstral_peak_prominence_mean"] = output.output.cpp_out["mean_cpp"]
1453            feature_data["cepstral_peak_prominence_std"] = output.output.cpp_out["std_dev_cpp"]
1454        # Formant (F1, F2):
1455        if formants:
1456            feature_data["mean_f1_loc"] = output.output.formants_out["f1_mean"]
1457            feature_data["std_f1_loc"] = output.output.formants_out["f1_std"]
1458            feature_data["mean_b1_loc"] = output.output.formants_out["b1_mean"]
1459            feature_data["std_b1_loc"] = output.output.formants_out["b1_std"]
1460            feature_data["mean_f2_loc"] = output.output.formants_out["f2_mean"]
1461            feature_data["std_f2_loc"] = output.output.formants_out["f2_std"]
1462            feature_data["mean_b2_loc"] = output.output.formants_out["b2_mean"]
1463            feature_data["std_b2_loc"] = output.output.formants_out["b2_std"]
1464        # Spectral Moments:
1465        if spectral_moments:
1466            feature_data["spectral_gravity"] = output.output.spectral_moments_out["spectral_gravity"]
1467            feature_data["spectral_std_dev"] = output.output.spectral_moments_out["spectral_std_dev"]
1468            feature_data["spectral_skewness"] = output.output.spectral_moments_out["spectral_skewness"]
1469            feature_data["spectral_kurtosis"] = output.output.spectral_moments_out["spectral_kurtosis"]
1470        # Jitter Descriptors:
1471        if jitter:
1472            feature_data["local_jitter"] = output.output.jitter_out["local_jitter"]
1473            feature_data["localabsolute_jitter"] = output.output.jitter_out["localabsolute_jitter"]
1474            feature_data["rap_jitter"] = output.output.jitter_out["rap_jitter"]
1475            feature_data["ppq5_jitter"] = output.output.jitter_out["ppq5_jitter"]
1476            feature_data["ddp_jitter"] = output.output.jitter_out["ddp_jitter"]
1477        # Shimmer Descriptors:
1478        if shimmer:
1479            feature_data["local_shimmer"] = output.output.shimmer_out["local_shimmer"]
1480            feature_data["localDB_shimmer"] = output.output.shimmer_out["localDB_shimmer"]
1481            feature_data["apq3_shimmer"] = output.output.shimmer_out["apq3_shimmer"]
1482            feature_data["apq5_shimmer"] = output.output.shimmer_out["apq5_shimmer"]
1483            feature_data["apq11_shimmer"] = output.output.shimmer_out["apq11_shimmer"]
1484            feature_data["dda_shimmer"] = output.output.shimmer_out["dda_shimmer"]
1485
1486        extracted_data.append(feature_data)
1487
1488    return extracted_data
def get_sound(audio: Union[Path, Audio], sampling_rate: int = 16000) -> parselmouth.Sound:
    """Get a sound object from a given audio file or Audio object.

    Args:
        audio (Union[Path, Audio]): A path to an audio file or an Audio object.
        sampling_rate (int, optional): The sampling rate of the audio. Defaults to 16000.

    Returns:
        parselmouth.Sound: A Parselmouth Sound object, downmixed to mono and
            resampled to ``sampling_rate`` if needed.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.
        FileNotFoundError: If the file is not found at the given path.
        TypeError: If `audio` is neither a Path nor an Audio object.
        RuntimeError: If any other error occurs while loading or preprocessing the sound.
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    try:
        # Loading the sound
        if isinstance(audio, Path):
            audio = audio.resolve()
            if not audio.exists():
                logger.error(f"File does not exist: {audio}")
                raise FileNotFoundError(f"File does not exist: {audio}")
            snd_full = parselmouth.Sound(str(audio))
        elif isinstance(audio, Audio):
            snd_full = parselmouth.Sound(audio.waveform, audio.sampling_rate)
        else:
            # Fail fast with a clear message instead of hitting an
            # UnboundLocalError on `snd_full` below.
            raise TypeError(f"Unsupported audio type: {type(audio)}")

        # Preprocessing: downmix to mono and resample when required.
        if parselmouth.praat.call(snd_full, "Get number of channels") > 1:
            snd_full = snd_full.convert_to_mono()
        if parselmouth.praat.call(snd_full, "Get sampling frequency") != sampling_rate:
            snd_full = parselmouth.praat.call(snd_full, "Resample", sampling_rate, 50)
            # Details of query: https://www.fon.hum.uva.nl/praat/manual/Get_sampling_frequency.html
            # Details of conversion: https://www.fon.hum.uva.nl/praat/manual/Sound__Resample___.html
    except (FileNotFoundError, TypeError):
        # Re-raise the exceptions documented above instead of masking
        # them as a generic RuntimeError.
        raise
    except Exception as e:
        raise RuntimeError(f"Error loading sound: {e}") from e
    return snd_full

Get a sound object from a given audio file or Audio object.

Arguments:
  • audio (Union[Path, Audio]): A path to an audio file or an Audio object.
  • sampling_rate (int, optional): The sampling rate of the audio. Defaults to 16000.

Returns:
  • parselmouth.Sound: A Parselmouth Sound object.

Raises:
  • FileNotFoundError: If the file is not found at the given path.
def extract_speech_rate(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
    """Extract speech timing and pausing features from a given sound object.

    Implements a syllable-nuclei / silence-detection approach: syllables are counted
    as voiced intensity peaks, and pauses are detected as silent intervals.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.

    Returns:
        Dict[str, float]: A dictionary containing the following features:

            - speaking_rate (float): Number of syllables divided by duration.
            - articulation_rate (float): Number of syllables divided by phonation time.
            - phonation_ratio (float): Phonation time divided by duration.
            - pause_rate (float): Number of pauses divided by duration.
            - mean_pause_dur (float): Total time pausing divided by the number of identified pauses.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_speech_rate(snd)
        {
            'speaking_rate': 5.3,
            'articulation_rate': 4.7,
            'phonation_ratio': 0.9,
            'pause_rate': 2.1,
            'mean_pause_dur': 0.5
        }
        ```

    Useful sources for this code:

        - https://sites.google.com/view/uhm-o-meter/scripts/syllablenuclei_v3?pli=1
        - https://drive.google.com/file/d/1o3mNdN5FKTiYQC9GHB1XoZ8JJIGZk_AK/view
        - (2009 paper) https://doi.org/10.3758/BRM.41.2.385
        - (2021 paper) https://doi.org/10.1080/0969594X.2021.1951162
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    try:
        # _____________________________________________________________________________________________________________
        # Load the sound object into parselmouth if it is an Audio object
        if not isinstance(snd, parselmouth.Sound):
            snd = get_sound(snd)

        # _____________________________________________________________________________________________________________
        # Key pause detection hyperparameters

        # Silence Threshold (dB) - standard setting to detect silence in the "To TextGrid (silences)" function.
        # The higher this number, the lower the chances of finding silent pauses
        silence_db = -25

        # Minimum_dip_between_peaks_(dB) - if there are decreases in intensity
        # of at least this value surrounding the peak, the peak is labelled to be a syllable nucleus
        # I.e. the size of the dip between two possible peaks
        # The higher this number, the fewer syllables will be found
        # For a clean and filtered signal use 4, otherwise use 2 (recommended thresholds)
        min_dip = 4
        # Code for determining if the signal is not clean/filtered
        hnr = parselmouth.praat.call(
            snd.to_harmonicity_cc(), "Get mean", 0, 0
        )  # Note: (0,0) is the time range for extraction, setting both to zero tells praat to use the full file
        if hnr < 60:
            min_dip = 2

        # Minimum pause duration (s): How long should a pause be to be counted as a silent pause?
        # The higher this number, the fewer pauses will be found
        min_pause = 0.3  # the default for this is 0.1 in Praat, the de Jong's script has this set at 0.3
        # Based on values in: Toward an understanding of fluency:
        # A microanalysis of nonnative speaker conversations (Riggenbach)
        # – Micropause (silence of .2s or less)
        # – Hesitation (silence of .3 to .4s)
        # – Unfilled pause (silence of .5s or more)

        # ______________________________________________________________________________________________________________
        # Intensity information

        intensity = snd.to_intensity(minimum_pitch=50, time_step=0.016, subtract_mean=True)
        # These are the settings recommended by de Jong - "minimum pitch" set to 50 Hz.
        # With this parameter setting, we extract intensity smoothed over a time window of (3.2/minimum_pitch)=64 msec,
        #  with 16-msec time steps; explanation on these calculations is found at:
        # https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html

        min_intensity = parselmouth.praat.call(intensity, "Get minimum", 0, 0, "Parabolic")  # time range, Interpolation
        max_intensity = parselmouth.praat.call(intensity, "Get maximum", 0, 0, "Parabolic")  # time range, Interpolation

        # Silence is detected by measuring whether the intensity is 25 dB below the 99% highest peak
        # 99% is chosen to eliminate short loud bursts in intensity that may not have been speech

        # get .99 quantile to get maximum (without influence of non-speech sound bursts)
        max_99_intensity = parselmouth.praat.call(intensity, "Get quantile", 0, 0, 0.99)

        # estimate Intensity threshold
        silence_db_1 = max_99_intensity + silence_db
        db_adjustment = max_intensity - max_99_intensity
        silence_db_2 = silence_db - db_adjustment
        if silence_db_1 < min_intensity:
            silence_db_1 = min_intensity

        # ______________________________________________________________________________________________________________
        # Create a TextGrid marking the silent and sounding intervals, then store these intervals

        textgrid = parselmouth.praat.call(
            intensity, "To TextGrid (silences)", silence_db_2, min_pause, 0.1, "silent", "sounding"
        )
        # Hyperparameters:
        # Silence threshold (dB),
        # Minimum silent interval (s) - minimum duration for an interval to be considered as silent
        # Minimum sounding interval (s) - minimum duration for an interval to be not considered as silent
        # Silent interval label
        # Sounding interval label

        # Loop through intervals and extract times of identified silent and sounding sections
        silencetier = parselmouth.praat.call(textgrid, "Extract tier", 1)
        silencetable = parselmouth.praat.call(silencetier, "Down to TableOfReal", "sounding")
        npauses = parselmouth.praat.call(silencetable, "Get number of rows")

        # NOTE(review): if no sounding interval is found (npauses == 0), begin_speak/end_speak
        # below stay unbound and the resulting NameError is handled by the except block.
        phonation_time = 0
        for ipause in range(npauses):
            pause = ipause + 1
            beginsound = parselmouth.praat.call(silencetable, "Get value", pause, 1)
            endsound = parselmouth.praat.call(silencetable, "Get value", pause, 2)
            speakingdur = endsound - beginsound

            phonation_time += speakingdur

            # This is to remove the first (before first word) and last (after last word) silence from consideration
            if pause == 1:
                begin_speak = beginsound
            if pause == (npauses):
                end_speak = endsound

        # ______________________________________________________________________________________________________________
        # Next block of code finds all possible peaks

        # Convert intensity contour into a 2d matrix representation
        intensity_matrix = parselmouth.praat.call(intensity, "Down to Matrix")  # convert intensity to 2d representation

        # Convert intensity contour into sound representation
        sound_from_intensity_matrix = parselmouth.praat.call(intensity_matrix, "To Sound (slice)", 1)

        # find positive extrema, maxima in sound_from_intensity_matrix, which correspond to steepest rises in Intensity;
        point_process = parselmouth.praat.call(
            sound_from_intensity_matrix,
            "To PointProcess (extrema)",
            "Left",
            "yes",
            "no",
            "Sinc70",
        )

        # estimate peak positions (all peaks)
        t = []
        numpeaks = parselmouth.praat.call(point_process, "Get number of points")
        for i in range(numpeaks):
            t.append(parselmouth.praat.call(point_process, "Get time from index", i + 1))

        # ______________________________________________________________________________________________________________
        # Find the time and values of all peaks

        # fill array with intensity values, keeping only peaks louder than the silence threshold
        timepeaks = []
        peakcount = 0
        intensities = []
        for i in range(numpeaks):
            value = parselmouth.praat.call(sound_from_intensity_matrix, "Get value at time", t[i], "Cubic")
            if value > silence_db_1:
                peakcount += 1
                intensities.append(value)
                timepeaks.append(t[i])

        # ______________________________________________________________________________________________________________
        # Now find all valid peaks

        # fill array with valid peaks: only intensity values if preceding
        # dip in intensity is greater than min_dip
        validpeakcount = 0
        currenttime = timepeaks[0]
        currentint = intensities[0]
        validtime = []

        for p in range(peakcount - 1):
            following = p + 1
            followingtime = timepeaks[following]
            dip = parselmouth.praat.call(
                intensity, "Get minimum", currenttime, followingtime, "None"
            )  # Gets minimum value between two time points, doesn't interpolate/filter
            diffint = abs(currentint - dip)
            if diffint > min_dip:
                validpeakcount += 1
                validtime.append(timepeaks[p])
            # Update current time and intensity values for next loop
            currenttime = timepeaks[following]
            currentint = parselmouth.praat.call(intensity, "Get value at time", timepeaks[following], "Cubic")

        # ______________________________________________________________________________________________________________
        # Extract voicing information

        pitch = snd.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
        # Praat page for hyperparameters https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html
        # From de Jong's 2009 paper - We extract the pitch contour, this time using a window size of 100 msec
        # and 20-msec time steps, and exclude all peaks that are unvoiced
        # Key hyperparameters are different to praat recommended - can't find a reason for this
        # time_step: Optional[Positive[float]] = None,  - set per De Jong's recommendation
        # pitch_floor: Positive[float] = 75.0 set per de Jong recommendation - 3/30 gives 100ms
        # max_number_of_candidates: Positive[int] = 15 (can't find a reason for this value being lower)
        # very_accurate: bool = False,
        # silence_threshold: float = 0.03,
        # voicing_threshold: float = 0.45, (can't find a reason for this value being different)
        # octave_cost: float = 0.01,
        # octave_jump_cost: float = 0.35,
        # voiced_unvoiced_cost: float = 0.14, (can't find a reason for this value being different)
        # pitch_ceiling: Positive[float] = 600.0 (can't find a reason for this value being lower, might change to value
        # from pitch_value function)

        # ______________________________________________________________________________________________________________
        # Loop through valid peaks, count ones that are voiced (i.e., have valid pitch value at the same time)

        number_syllables = int(0)
        for time in range(validpeakcount):
            querytime = validtime[time]
            whichinterval = parselmouth.praat.call(textgrid, "Get interval at time", 1, querytime)
            whichlabel = parselmouth.praat.call(textgrid, "Get label of interval", 1, whichinterval)
            value = pitch.get_value_at_time(querytime)
            if not np.isnan(value):
                if whichlabel == "sounding":
                    number_syllables += 1

        # ______________________________________________________________________________________________________________
        # return results

        original_dur = end_speak - begin_speak

        speaking_rate = number_syllables / original_dur
        articulation_rate = number_syllables / phonation_time
        phonation_ratio = phonation_time / original_dur

        number_pauses = npauses - 1
        pause_time = original_dur - phonation_time

        pause_rate = number_pauses / original_dur
        mean_pause_dur = pause_time / number_pauses if number_pauses > 0 else 0.0

        return {
            "speaking_rate": speaking_rate,
            "articulation_rate": articulation_rate,
            "phonation_ratio": phonation_ratio,
            "pause_rate": pause_rate,
            "mean_pause_dur": mean_pause_dur,
        }

    except Exception as e:
        # Any failure (praat errors, empty peak lists, missing intervals) is logged
        # and reported as NaN features rather than propagated to the caller.
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {
            "speaking_rate": np.nan,
            "articulation_rate": np.nan,
            "phonation_ratio": np.nan,
            "pause_rate": np.nan,
            "mean_pause_dur": np.nan,
        }

Extract speech timing and pausing features from a given sound object.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
Returns:

Dict[str, float]: A dictionary containing the following features:

- speaking_rate (float): Number of syllables divided by duration.
- articulation_rate (float): Number of syllables divided by phonation time.
- phonation_ratio (float): Phonation time divided by duration.
- pause_rate (float): Number of pauses divided by duration.
- mean_pause_dur (float): Total time pausing divided by the number of identified pauses.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_speech_rate(snd)
{
    'speaking_rate': 5.3,
    'articulation_rate': 4.7,
    'phonation_ratio': 0.9,
    'pause_rate': 2.1,
    'mean_pause_dur': 0.5
}
Useful sources for this code: the syllable-nuclei Praat scripts by de Jong and Wempe and their 2009 and 2021 papers (links are listed in the function docstring).
def extract_pitch_values(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
    """Estimate Pitch Range.

    Calculates the mean pitch using a wide range and uses this to shorten the range for future pitch extraction
    algorithms.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.

    Returns:
        dict: A dictionary containing the following keys:

            - pitch_floor (float): The lowest pitch value to use in future pitch extraction algorithms.
            - pitch_ceiling (float): The highest pitch value to use in future pitch extraction algorithms.

    Notes:
        Values are taken from: [Standardization of pitch-range settings in voice acoustic analysis](https://doi.org/10.3758/BRM.41.2.318)

        The problem observed with doing a really broad pitch search was the occasional error if F1 was low.
        So crude outlier detection is used to help with this.

        Important: These values are used within other functions, they are not outputs of the full code.

        Different pitch extraction methods in Praat:

        - Cross-correlation (Praat default) vs auto-correlation pitch extraction:
        both are used in different functions below.
        - Cross-correlation is better than auto-correlation at finding period-level variation,
        such as jitter and shimmer, whereas auto-correlation is better at finding intended intonation contours.
        - [Discussion on this on a Praat Forum](https://groups.io/g/Praat-Users-List/topic/pitch_detection_ac_vs_cc/78829266?p=,,,20,0,0,0::recentpostdate/sticky,,,20,2,20,78829266,previd=1612369050729515119,nextid=1605568402827788039&previd=1612369050729515119&nextid=1605568402827788039)

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> pitch_values(snd)
        {'pitch_floor': 60, 'pitch_ceiling': 250}
        ```
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    try:
        if not isinstance(snd, parselmouth.Sound):
            snd = get_sound(snd)

        pitch_wide = snd.to_pitch_ac(time_step=0.005, pitch_floor=50, pitch_ceiling=600)
        # Other than values above, I'm using default hyperparameters
        # Details: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html

        # Keep voiced frames only (Praat reports unvoiced frames as 0 Hz)
        pitch_values = pitch_wide.selected_array["frequency"]
        pitch_values = pitch_values[pitch_values != 0]

        if pitch_values.size == 0:
            # Fully unvoiced/silent audio: without this guard np.mean/np.std
            # would return NaN (with a warning, not an exception) and the
            # `mean_pitch < 170` check would silently fall through to the
            # 'female/child' settings. Raise instead so the except block
            # returns the NaN sentinels.
            raise ValueError("No voiced frames found; cannot estimate pitch range.")

        # remove outliers from wide pitch search via |z| <= 2 filtering
        pitch_std = np.std(pitch_values)
        if pitch_std == 0:
            # Constant pitch: z-scores are undefined (division by zero), and
            # there are no outliers to remove anyway.
            pitch_values_filtered = pitch_values
        else:
            pitch_values_z = (pitch_values - np.mean(pitch_values)) / pitch_std
            pitch_values_filtered = pitch_values[abs(pitch_values_z) <= 2]

        mean_pitch = np.mean(pitch_values_filtered)

        # Here there is an interesting alternative solution to discuss: https://praatscripting.lingphon.net/conditionals-1.html
        if mean_pitch < 170:
            # 'male' settings
            pitch_floor = 60.0
            pitch_ceiling = 250.0
        else:
            # 'female' and 'child' settings
            pitch_floor = 100.0
            pitch_ceiling = 500.0

        return {"pitch_floor": pitch_floor, "pitch_ceiling": pitch_ceiling}
    except Exception as e:
        # Log and return NaN sentinels, consistent with the other extractors.
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {"pitch_floor": np.nan, "pitch_ceiling": np.nan}

Estimate Pitch Range.

Calculates the mean pitch using a wide range and uses this to shorten the range for future pitch extraction algorithms.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
Returns:

dict: A dictionary containing the following keys:

- pitch_floor (float): The lowest pitch value to use in future pitch extraction algorithms.
- pitch_ceiling (float): The highest pitch value to use in future pitch extraction algorithms.
Notes:

Values are taken from: Standardization of pitch-range settings in voice acoustic analysis

The problem observed with doing a really broad pitch search was the occasional error if F1 was low. So crude outlier detection is used to help with this.

Important: These values are used within other functions, they are not outputs of the full code.

Different pitch extraction methods in Praat:

  • Cross-correlation (Praat default) vs auto-correlation pitch extraction: both are used in different functions below.
  • Cross-correlation is better than auto-correlation at finding period-level variation, such as jitter and shimmer, whereas auto-correlation is better at finding intended intonation contours.
  • Discussion on this on a Praat Forum
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> pitch_values(snd)
{'pitch_floor': 60, 'pitch_ceiling': 250}
def extract_pitch_descriptors( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float, frame_shift: float = 0.005, unit: str = 'Hertz') -> Dict[str, float]:
def extract_pitch_descriptors(
    snd: Union[parselmouth.Sound, Path, Audio],
    floor: float,
    ceiling: float,
    frame_shift: float = 0.005,
    unit: str = "Hertz",
) -> Dict[str, float]:
    """Extract key pitch descriptors from a sound.

    Pitch is estimated with Praat's autocorrelation method (`pitch_ac`), which is
    better suited than cross-correlation to tracking intended intonation contours.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
            Defaults to 0.005.
        unit (str, optional): The unit in which the pitch is returned. Defaults to "Hertz".
            Could be "semitones".

    Returns:
        dict: A dictionary containing the following keys:

            - mean_f0_{unit} (float): Mean pitch in {unit}.
            - stdev_f0_{unit} (float): Standard deviation in {unit}.

    Notes:
        - stdev_f0_semitone is used in DOI: 10.1080/02699200400008353, which used this as a marker for dysphonia.
        - All hyperparameters other than those exposed here are Praat defaults:
          https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_pitch_descriptors(snd, 75, 500, 0.01, "Hertz")
        {'mean_f0_hertz': 220.5, 'stdev_f0_hertz': 2.5}
        ```
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    mean_key = f"mean_f0_{unit.lower()}"
    stdev_key = f"stdev_f0_{unit.lower()}"
    try:
        sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

        # Autocorrelation-based pitch extraction over the whole recording.
        pitch_obj = sound.to_pitch_ac(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)

        # Query mean and standard deviation over the full time range (0, 0) in the requested unit.
        return {
            mean_key: parselmouth.praat.call(pitch_obj, "Get mean", 0, 0, unit),
            stdev_key: parselmouth.praat.call(pitch_obj, "Get standard deviation", 0, 0, unit),
        }
    except Exception as e:
        frame = inspect.currentframe()
        if frame is not None:
            logger.error(f'Error in "{frame.f_code.co_name}": \n' + str(e))
        return {mean_key: np.nan, stdev_key: np.nan}

Extract Pitch Features.

Function to extract key pitch features from a given sound object. This function uses the pitch_ac method as autocorrelation is better at finding intended intonation contours.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • ceiling (float): Maximum expected pitch value, set using value found in pitch_values function.
  • frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms. Defaults to 0.005.
  • unit (str, optional): The unit in which the pitch is returned. Defaults to "Hertz". Could be "semitones".
Returns:

dict: A dictionary containing the following keys:

- mean_f0_{unit} (float): Mean pitch in {unit}.
- stdev_f0_{unit} (float): Standard deviation in {unit}.
Notes:
  • Uses pitch_ac as autocorrelation is better at finding intended intonation contours.
  • stdev_f0_semitone is used in DOI: 10.1080/02699200400008353, which used this as a marker for dysphonia.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_pitch_descriptors(snd, 75, 500, 0.01, "Hertz")
{'mean_f0_hertz': 220.5, 'stdev_f0_hertz': 2.5}
def extract_intensity_descriptors( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, frame_shift: float) -> Dict[str, float]:
def extract_intensity_descriptors(
    snd: Union[parselmouth.Sound, Path, Audio], floor: float, frame_shift: float
) -> Dict[str, float]:
    """Extract key intensity descriptors from a sound.

    Based on default Praat code adapted to work with Parselmouth.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
        frame_shift (float): Time rate at which to extract a new intensity value, typically set to 5 ms.

    Returns:
        dict: A dictionary containing the following keys:

            - mean_db (float): Mean intensity in dB.
            - std_db (float): Standard deviation in dB.
            - range_db_ratio (float): Intensity range, expressed as a ratio in dB.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_intensity_descriptors(snd, 75, 0.01)
        {'mean_db': 70.5, 'std_db': 0.5, 'range_db_ratio': 2.5}
        ```

    Notes:
        - Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html
        - For notes on extracting mean settings: https://www.fon.hum.uva.nl/praat/manual/Intro_6_2__Configuring_the_intensity_contour.html
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    try:
        sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

        # Intensity contour; hyperparameters documented at
        # https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html
        intensity = sound.to_intensity(minimum_pitch=floor, time_step=frame_shift, subtract_mean=True)

        # All queries run over the full time range (0, 0).
        mean_db = parselmouth.praat.call(intensity, "Get mean", 0, 0, "energy")  # energy averaging method
        std_db = parselmouth.praat.call(intensity, "Get standard deviation", 0, 0)
        minimum_db = parselmouth.praat.call(intensity, "Get minimum", 0, 0, "parabolic")  # parabolic interpolation
        maximum_db = parselmouth.praat.call(intensity, "Get maximum", 0, 0, "parabolic")  # parabolic interpolation

        return {"mean_db": mean_db, "std_db": std_db, "range_db_ratio": maximum_db / minimum_db}

    except Exception as e:
        frame = inspect.currentframe()
        if frame is not None:
            logger.error(f'Error in "{frame.f_code.co_name}": \n' + str(e))
        return {"mean_db": np.nan, "std_db": np.nan, "range_db_ratio": np.nan}

Extract Intensity Features.

Function to extract key intensity information from a given sound object. This function is based on default Praat code adapted to work with Parselmouth.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • frame_shift (float): Time rate at which to extract a new intensity value, typically set to 5 ms.
Returns:

dict: A dictionary containing the following keys:

- mean_db (float): Mean intensity in dB.
- std_db (float): Standard deviation in dB.
- range_db_ratio (float): Intensity range, expressed as a ratio in dB.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_intensity_descriptors(snd, 75, 0.01)
{'mean_db': 70.5, 'std_db': 0.5, 'range_db_ratio': 2.5}
Notes:
  • Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html
  • For notes on extracting mean settings: https://www.fon.hum.uva.nl/praat/manual/Intro_6_2__Configuring_the_intensity_contour.html
def extract_harmonicity_descriptors( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, frame_shift: float) -> Dict[str, float]:
def extract_harmonicity_descriptors(
    snd: Union[parselmouth.Sound, Path, Audio], floor: float, frame_shift: float
) -> Dict[str, float]:
    """Voice Quality - HNR.

    Calculate the Harmonic to Noise Ratio (HNR) in dB from a given sound object,
    using the CC method as recommended by Praat.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.

    Returns:
        dict: A dictionary containing the following keys:

            - hnr_db_mean (float): Mean Harmonic to Noise Ratio in dB.
            - hnr_db_std_dev (float): Harmonic to Noise Ratio standard deviation in dB.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_harmonicity_descriptors(snd, 75, 0.01)
        {'hnr_db_mean': 15.3, 'hnr_db_std_dev': 0.5}
        ```

    Notes:
        - Praat recommends using the CC method: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__cc____.html
        - Default settings can be found at: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__ac____.html
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    try:
        sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

        # Cross-correlation HNR, as recommended by Praat:
        # https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__cc____.html
        harmonicity = sound.to_harmonicity_cc(
            time_step=frame_shift, minimum_pitch=floor, silence_threshold=0.1, periods_per_window=4.5
        )

        # Summarize over the full time range (0, 0).
        return {
            "hnr_db_mean": parselmouth.praat.call(harmonicity, "Get mean", 0, 0),
            "hnr_db_std_dev": parselmouth.praat.call(harmonicity, "Get standard deviation", 0, 0),
        }
    except Exception as e:
        frame = inspect.currentframe()
        if frame is not None:
            logger.error(f'Error in "{frame.f_code.co_name}": \n' + str(e))

        return {"hnr_db_mean": np.nan, "hnr_db_std_dev": np.nan}

Voice Quality - HNR.

Function to calculate the Harmonic to Noise Ratio (HNR) in dB from a given sound object. This function uses the CC method as recommended by Praat.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
Returns:

dict: A dictionary containing the following key:

- hnr_db_mean (float): Mean Harmonic to Noise Ratio in dB.
- hnr_db_std_dev (float): Harmonic to Noise Ratio standard deviation in dB.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_harmonicity_descriptors(snd, 75, 0.01)
{'hnr_db_mean': 15.3, 'hnr_db_std_dev': 0.5}
Notes:
  • Praat recommends using the CC method: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__cc____.html
  • Default settings can be found at: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__ac____.html
def extract_slope_tilt( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float) -> Dict[str, float]:
def extract_slope_tilt(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
    """Voice Quality - Spectral Slope/Tilt.

    Extract spectral slope and tilt from a given sound object. Based on default
    Praat code adapted to work with Parselmouth.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.

    Returns:
        dict: A dictionary containing the following keys:

            - spectral_slope (float): Mean spectral slope.
            - spectral_tilt (float): Mean spectral tilt.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_slope_tilt(snd, 75, 500)
        {'spectral_slope': -0.8, 'spectral_tilt': -2.5}
        ```

    Notes:
        - Spectral Slope: Ratio of energy in a spectra between 10-1000Hz over 1000-4000Hz.
        - Spectral Tilt: Linear slope of energy distribution between 100-5000Hz.
        - Using pitch-corrected LTAS to remove the effect of F0 and harmonics on the slope calculation:
        https://www.fon.hum.uva.nl/paul/papers/BoersmaKovacic2006.pdf
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    try:
        sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

        # Pitch-corrected LTAS removes the effect of F0/harmonics on the slope estimate.
        # Arguments: min pitch (Hz), max pitch (Hz), maximum frequency (Hz), bandwidth (Hz),
        # shortest period (s), longest period (s), maximum period factor.
        ltas = parselmouth.praat.call(
            sound, "To Ltas (pitch-corrected)...", floor, ceiling, 5000, 100, 0.0001, 0.02, 1.3
        )

        # Arguments: f1min, f1max, f2min, f2max, averaging units.
        slope = parselmouth.praat.call(ltas, "Get slope", 50, 1000, 1000, 4000, "dB")

        # Arguments: minimum frequency, maximum frequency, frequency scale (linear or logarithmic),
        # fit method (least squares or robust).
        tilt_report = parselmouth.praat.call(ltas, "Report spectral tilt", 100, 5000, "Linear", "Robust")

        # The tilt value sits between "Slope: " and the next "d" (start of the unit) in the report text.
        after_label = tilt_report.partition("Slope: ")[2]
        tilt = float(after_label[: after_label.index("d")])

        return {"spectral_slope": slope, "spectral_tilt": tilt}

    except Exception as e:
        frame = inspect.currentframe()
        if frame is not None:
            logger.error(f'Error in "{frame.f_code.co_name}": \n' + str(e))
        return {"spectral_slope": np.nan, "spectral_tilt": np.nan}

Voice Quality - Spectral Slope/Tilt.

Function to extract spectral slope and tilt from a given sound object. This function is based on default Praat code adapted to work with Parselmouth.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • ceiling (float): Maximum expected pitch value, set using value found in pitch_values function.
Returns:

dict: A dictionary containing the following keys:

- spectral_slope (float): Mean spectral slope.
- spectral_tilt (float): Mean spectral tilt.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_slope_tilt(snd, 75, 500)
{'spectral_slope': -0.8, 'spectral_tilt': -2.5}
Notes:
  • Spectral Slope: Ratio of energy in a spectra between 10-1000Hz over 1000-4000Hz.
  • Spectral Tilt: Linear slope of energy distribution between 100-5000Hz.
  • Using pitch-corrected LTAS to remove the effect of F0 and harmonics on the slope calculation: https://www.fon.hum.uva.nl/paul/papers/BoersmaKovacic2006.pdf
def extract_cpp_descriptors( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float, frame_shift: float) -> Dict[str, float]:
def extract_cpp_descriptors(
    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, frame_shift: float
) -> Dict[str, float]:
    """Extract Cepstral Peak Prominence (CPP).

    Compute smoothed CPP (CPPS) over every voiced interval of the recording and
    summarize the values with their mean and standard deviation. Adapted from
    default Praat code to work with Parselmouth.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.

    Returns:
        dict: A dictionary containing the following keys:

            - mean_cpp (float): Mean Cepstral Peak Prominence.
            - std_dev_cpp (float): Standard deviation in Cepstral Peak Prominence.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_cpp_descriptors(snd, 75, 500, 0.01)
        {'mean_cpp': 20.3, 'std_dev_cpp': 0.5}
        ```

    Notes:
        - Cepstral Peak Prominence: The height (i.e., “prominence”) of that peak relative to a regression line
        through the overall cepstrum.
        - Adapted from: https://osf.io/ctwgr and http://phonetics.linguistics.ucla.edu/facilities/acoustic/voiced_extract_auto.txt
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    try:
        sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

        # Pitch (autocorrelation) drives the voiced/unvoiced segmentation below.
        pitch = sound.to_pitch_ac(
            time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling, voicing_threshold=0.3
        )
        pulses = parselmouth.praat.call([sound, pitch], "To PointProcess (cc)")
        textgrid = parselmouth.praat.call(pulses, "To TextGrid (vuv)", 0.02, 0.1)

        # Table arguments: include line number, time decimals, include tier names, include empty intervals.
        vuv_table = parselmouth.praat.call(textgrid, "Down to Table", "no", 6, "yes", "no")

        voiced_cpps = []
        row_count = parselmouth.praat.call(vuv_table, "Get number of rows")
        for row in range(1, row_count + 1):
            # Only voiced ("V") intervals contribute to the CPP statistics.
            if parselmouth.praat.call(vuv_table, "Get value", row, "text") != "V":
                continue

            t_start = parselmouth.praat.call(vuv_table, "Get value", row, "tmin")
            t_end = parselmouth.praat.call(vuv_table, "Get value", row, "tmax")
            segment = sound.extract_part(float(t_start), float(t_end))

            # PowerCepstrogram: 60-Hz pitch floor, 2-ms time step, 5-kHz maximum frequency,
            # pre-emphasis from 50 Hz.
            cepstrogram = parselmouth.praat.call(segment, "To PowerCepstrogram", 60, 0.002, 5000, 50)

            try:
                # Arguments: subtract tilt before smoothing = "no"; time averaging window = 0.01 s;
                # quefrency averaging window = 0.001 s; peak search pitch range = 60-330 Hz;
                # tolerance = 0.05; interpolation = "parabolic"; tilt line quefrency range = 0.001-0
                # (no upper bound); line type = "Straight"; fit method = "Robust".
                cpp_value = parselmouth.praat.call(
                    cepstrogram,
                    "Get CPPS...",
                    "no",
                    0.01,
                    0.001,
                    60,
                    330,
                    0.05,
                    "parabolic",
                    0.001,
                    0,
                    "Straight",
                    "Robust",
                )
            except Exception as e:
                frame = inspect.currentframe()
                if frame is not None:
                    logger.error(f'Error in "{frame.f_code.co_name}": \n' + str(e))
                cpp_value = np.nan

            # Keep only plausible CPP values (> 4) from successfully analyzed segments.
            if not np.isnan(cpp_value) and cpp_value > 4:
                voiced_cpps.append(cpp_value)

        # Summary statistics over all retained voiced-interval CPP values.
        if voiced_cpps:
            return {"mean_cpp": np.mean(np.array(voiced_cpps)), "std_dev_cpp": np.std(np.array(voiced_cpps))}
        return {"mean_cpp": np.nan, "std_dev_cpp": np.nan}

    except Exception as e:
        frame = inspect.currentframe()
        if frame is not None:
            logger.error(f'Error in "{frame.f_code.co_name}": \n' + str(e))
        return {"mean_cpp": np.nan, "std_dev_cpp": np.nan}

Extract Cepstral Peak Prominence (CPP).

Function to calculate the Cepstral Peak Prominence (CPP) from a given sound object. This function is adapted from default Praat code to work with Parselmouth.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • ceiling (float): Maximum expected pitch value, set using value found in pitch_values function.
  • frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
Returns:

dict: A dictionary containing the following key:

- mean_cpp (float): Mean Cepstral Peak Prominence.
- std_dev_cpp (float): Standard deviation in Cepstral Peak Prominence.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_CPP(snd, 75, 500, 0.01)
{'mean_cpp': 20.3, 'std_dev_cpp': 0.5}
Notes:
  • Cepstral Peak Prominence: The height (i.e., “prominence”) of that peak relative to a regression line through the overall cepstrum.
  • Adapted from: https://osf.io/ctwgr and http://phonetics.linguistics.ucla.edu/facilities/acoustic/voiced_extract_auto.txt
def measure_f1f2_formants_bandwidths( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float, frame_shift: float) -> Dict[str, float]:
def measure_f1f2_formants_bandwidths(
    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, frame_shift: float
) -> Dict[str, float]:
    """Extract Formant Frequency Features.

    Extract F1/F2 formant locations and bandwidths, sampled at glottal pulse times.
    Adapted from default Praat code to work with Parselmouth.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.

    Returns:
        dict: A dictionary containing the following keys:

            - f1_mean (float): Mean F1 location.
            - f1_std (float): Standard deviation of F1 location.
            - b1_mean (float): Mean F1 bandwidth.
            - b1_std (float): Standard deviation of F1 bandwidth.
            - f2_mean (float): Mean F2 location.
            - f2_std (float): Standard deviation of F2 location.
            - b2_mean (float): Mean F2 bandwidth.
            - b2_std (float): Standard deviation of F2 bandwidth.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> measure_f1f2_formants_bandwidths(snd, 75, 500, 0.01)
        {'f1_mean': 500.0, 'f1_std': 50.0, 'b1_mean': 80.0, 'b1_std': 10.0, 'f2_mean': 1500.0,
        'f2_std': 100.0, 'b2_mean': 120.0, 'b2_std': 20.0}
        ```

    Notes:
        - Formants are the resonances of the vocal tract, determined by tongue placement and vocal tract shape.
        - Mean F1 typically varies between 300 to 750 Hz, while mean F2 typically varies between 900 to 2300 Hz.
        - Formant bandwidth is measured by taking the width of the band forming 3 dB down from the formant peak.
        - Formant extraction occurs per pitch period (pulses), aligning the formant measurements precisely with
          the points where the vocal folds come together.
        - Adapted from code at this [link](https://osf.io/6dwr3/).
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    try:
        sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

        # Burg-method formant tracking; key hyperparameters:
        # https://www.fon.hum.uva.nl/praat/manual/Sound__To_Formant__burg____.html
        formants = parselmouth.praat.call(sound, "To Formant (burg)", frame_shift, 5, 5000, 0.025, 50)

        # CC-method pitch locates glottal pulses, at which the formants are sampled.
        pitch = sound.to_pitch_cc(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)
        pulses = parselmouth.praat.call([sound, pitch], "To PointProcess (cc)")

        tracks: Dict[str, List[float]] = {"f1": [], "b1": [], "f2": [], "b2": []}
        num_pulses = parselmouth.praat.call(pulses, "Get number of points")

        for idx in range(1, num_pulses + 1):
            t = parselmouth.praat.call(pulses, "Get time from index", idx)

            # Sample location and bandwidth of F1 and F2 at this pulse; NaN samples are dropped.
            for formant_number, freq_key, bw_key in ((1, "f1", "b1"), (2, "f2", "b2")):
                freq = parselmouth.praat.call(formants, "Get value at time", formant_number, t, "Hertz", "Linear")
                if not np.isnan(freq):
                    tracks[freq_key].append(freq)

                bandwidth = parselmouth.praat.call(
                    formants, "Get bandwidth at time", formant_number, t, "Hertz", "Linear"
                )
                if not np.isnan(bandwidth):
                    tracks[bw_key].append(bandwidth)

        result: Dict[str, float] = {}
        for key in ("f1", "b1", "f2", "b2"):
            values = tracks[key]
            # Empty tracks (no valid samples) yield NaN statistics.
            mean, std = (np.mean(values), np.std(values)) if values else (np.nan, np.nan)
            result[f"{key}_mean"] = mean
            result[f"{key}_std"] = std
        return result

    except Exception as e:
        frame = inspect.currentframe()
        if frame is not None:
            logger.error(f'Error in "{frame.f_code.co_name}": \n' + str(e))
        return {
            "f1_mean": np.nan,
            "f1_std": np.nan,
            "b1_mean": np.nan,
            "b1_std": np.nan,
            "f2_mean": np.nan,
            "f2_std": np.nan,
            "b2_mean": np.nan,
            "b2_std": np.nan,
        }

Extract Formant Frequency Features.

Function to extract formant frequency features from a given sound object. This function is adapted from default Praat code to work with Parselmouth.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • ceiling (float): Maximum expected pitch value, set using value found in pitch_values function.
  • frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
Returns:

dict: A dictionary containing the following keys:

- f1_mean (float): Mean F1 location.
- f1_std (float): Standard deviation of F1 location.
- b1_mean (float): Mean F1 bandwidth.
- b1_std (float): Standard deviation of F1 bandwidth.
- f2_mean (float): Mean F2 location.
- f2_std (float): Standard deviation of F2 location.
- b2_mean (float): Mean F2 bandwidth.
- b2_std (float): Standard deviation of F2 bandwidth.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> measureFormants(snd, 75, 500, 0.01)
{'f1_mean': 500.0, 'f1_std': 50.0, 'b1_mean': 80.0, 'b1_std': 10.0, 'f2_mean': 1500.0,
'f2_std': 100.0, 'b2_mean': 120.0, 'b2_std': 20.0}
Notes:
  • Formants are the resonances of the vocal tract, determined by tongue placement and vocal tract shape.
  • Mean F1 typically varies between 300 to 750 Hz, while mean F2 typically varies between 900 to 2300 Hz.
  • Formant bandwidth is measured by taking the width of the band forming 3 dB down from the formant peak.
  • Formant extraction occurs per pitch period (pulses), meaning that the analysis identifies the points in the sound where the vocal folds come together, helping to align the formant measurements precisely with the pitch periods.
  • Adapted from code at this link.
def extract_spectral_moments( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float, window_size: float, frame_shift: float) -> Dict[str, float]:
 930def extract_spectral_moments(
 931    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, window_size: float, frame_shift: float
 932) -> Dict[str, float]:
 933    """Extract Spectral Moments.
 934
 935    Function to extract spectral moments from a given sound object. This function is adapted from default
 936    Praat code to work with Parselmouth.
 937
 938    Args:
 939        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 940        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 941        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 942        window_size (float): Time frame over which the spectra is calculated, typically set to 25 ms.
 943        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 944
 945    Returns:
 946        dict: A dictionary containing the following keys:
 947
 948            - spectral_gravity (float): Mean spectral gravity.
 949            - spectral_std_dev (float): Mean spectral standard deviation.
 950            - spectral_skewness (float): Mean spectral skewness.
 951            - spectral_kurtosis (float): Mean spectral kurtosis.
 952
 953    Examples:
 954        ```python
 955        >>> snd = parselmouth.Sound("path_to_audio.wav")
 956        >>> extract_spectral_moments(snd, 75, 500, 0.025, 0.01)
 957        {'spectral_gravity': 5000.0, 'spectral_std_dev': 150.0, 'spectral_skewness': -0.5, 'spectral_kurtosis': 3.0}
 958        ```
 959
 960    Notes:
 961        - Spectral Gravity: Measure for how high the frequencies in a spectrum are on average over the entire frequency
 962        domain weighted by the power spectrum.
 963        - Spectral Standard Deviation: Measure for how much the frequencies in a spectrum can deviate from the centre
 964        of gravity.
 965        - Spectral Skewness: Measure for how much the shape of the spectrum below the centre of gravity is different
 966        from the shape above the mean frequency.
 967        - Spectral Kurtosis: Measure for how much the shape of the spectrum around the centre of gravity is different
 968          from a Gaussian shape.
 969        - Details: https://www.fon.hum.uva.nl/praat/manual/Spectrum__Get_central_moment___.html
 970    """
 971    if not PARSELMOUTH_AVAILABLE:
 972        raise ModuleNotFoundError(
 973            "`parselmouth` is not installed. "
 974            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
 975        )
 976
 977    try:
 978        if not isinstance(snd, parselmouth.Sound):
 979            snd = get_sound(snd)
 980
 981        # Extract pitch object for voiced checking
 982        pitch = snd.to_pitch_ac(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)
 983
 984        # Calculate Spectrogram
 985        spectrogram = snd.to_spectrogram(window_length=window_size, time_step=frame_shift)
 986        # Using default settings other than window length and frame shift
 987        # Details: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Spectrogram___.html
 988
 989        Gravity_list, STD_list, Skew_list, Kurt_list = [], [], [], []
 990
 991        num_steps = parselmouth.praat.call(spectrogram, "Get number of frames")
 992        for i in range(1, num_steps + 1):
 993            t = parselmouth.praat.call(spectrogram, "Get time from frame number", i)
 994            pitch_value = pitch.get_value_at_time(t)
 995
 996            if not np.isnan(pitch_value):
 997                voiced_spectrum = spectrogram.to_spectrum_slice(t)
 998                # Details: https://www.fon.hum.uva.nl/praat/manual/Spectrogram__To_Spectrum__slice____.html
 999
1000                Gravity_LLD = voiced_spectrum.get_centre_of_gravity(power=2)
1001                if not np.isnan(Gravity_LLD):
1002                    Gravity_list.append(Gravity_LLD)
1003
1004                STD_LLD = voiced_spectrum.get_standard_deviation(power=2)
1005                if not np.isnan(STD_LLD):
1006                    STD_list.append(STD_LLD)
1007
1008                Skew_LLD = voiced_spectrum.get_skewness(power=2)
1009                if not np.isnan(Skew_LLD):
1010                    Skew_list.append(Skew_LLD)
1011
1012                Kurt_LLD = voiced_spectrum.get_kurtosis(power=2)
1013                if not np.isnan(Kurt_LLD):
1014                    Kurt_list.append(Kurt_LLD)
1015
1016        gravity_mean = np.mean(Gravity_list) if Gravity_list else np.nan
1017        std_mean = np.mean(STD_list) if STD_list else np.nan
1018        skew_mean = np.mean(Skew_list) if Skew_list else np.nan
1019        kurt_mean = np.mean(Kurt_list) if Kurt_list else np.nan
1020
1021        return {
1022            "spectral_gravity": gravity_mean,
1023            "spectral_std_dev": std_mean,
1024            "spectral_skewness": skew_mean,
1025            "spectral_kurtosis": kurt_mean,
1026        }
1027
1028    except Exception as e:
1029        current_frame = inspect.currentframe()
1030        if current_frame is not None:
1031            current_function_name = current_frame.f_code.co_name
1032            logger.error(f'Error in "{current_function_name}": \n' + str(e))
1033        return {
1034            "spectral_gravity": np.nan,
1035            "spectral_std_dev": np.nan,
1036            "spectral_skewness": np.nan,
1037            "spectral_kurtosis": np.nan,
1038        }

Extract Spectral Moments.

Function to extract spectral moments from a given sound object. This function is adapted from default Praat code to work with Parselmouth.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • ceiling (float): Maximum expected pitch value, set using value found in pitch_values function.
  • window_size (float): Time frame over which the spectra is calculated, typically set to 25 ms.
  • frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
Returns:

dict: A dictionary containing the following keys:

- spectral_gravity (float): Mean spectral gravity.
- spectral_std_dev (float): Mean spectral standard deviation.
- spectral_skewness (float): Mean spectral skewness.
- spectral_kurtosis (float): Mean spectral kurtosis.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_spectral_moments(snd, 75, 500, 0.025, 0.01)
{'spectral_gravity': 5000.0, 'spectral_std_dev': 150.0, 'spectral_skewness': -0.5, 'spectral_kurtosis': 3.0}
Notes:
  • Spectral Gravity: Measure for how high the frequencies in a spectrum are on average over the entire frequency domain weighted by the power spectrum.
  • Spectral Standard Deviation: Measure for how much the frequencies in a spectrum can deviate from the centre of gravity.
  • Spectral Skewness: Measure for how much the shape of the spectrum below the centre of gravity is different from the shape above the mean frequency.
  • Spectral Kurtosis: Measure for how much the shape of the spectrum around the centre of gravity is different from a Gaussian shape.
  • Details: https://www.fon.hum.uva.nl/praat/manual/Spectrum__Get_central_moment___.html
def extract_audio_duration( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio]) -> Dict[str, float]:
def extract_audio_duration(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
    """Return the total duration of an audio input in seconds.

    The input is first turned into a Parselmouth `Sound` (if it is not one
    already), after which the Praat "Get total duration" command is queried.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object,
        a file path (Path), or an `Audio` object containing the audio waveform and
        its corresponding sampling rate.

    Returns:
        Dict[str, float]: A dictionary with a single entry:
            - "duration" (float): Total audio duration in seconds (NaN on failure).

    Raises:
        FileNotFoundError: If a provided file path does not exist.

    Example:
        ```python
        >>> snd = Audio(waveform=[...], sampling_rate=16000)
        >>> extract_audio_duration(snd)
        {'duration': 5.23}
        ```
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    # Path / Audio inputs are converted to a Parselmouth Sound up front.
    if not isinstance(snd, parselmouth.Sound):
        snd = get_sound(snd)

    try:
        return {"duration": parselmouth.praat.call(snd, "Get total duration")}
    except Exception as e:
        # Log the failure under this function's name and fall back to NaN.
        frame = inspect.currentframe()
        if frame is not None:
            current_function_name = frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {"duration": np.nan}

Get the duration of a given audio file or Audio object.

This function calculates the total duration of an audio file or audio object by creating a Parselmouth Sound object and then calling a Praat method to retrieve the duration of the audio in seconds.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path), or an Audio object containing the audio waveform and its corresponding sampling rate.
Returns:

Dict[str, float]: A dictionary containing:

- "duration" (float): The total duration of the audio in seconds.

Raises:
  • FileNotFoundError: If a provided file path does not exist.
Example:
>>> snd = Audio(waveform=[...], sampling_rate=16000)
>>> extract_audio_duration(snd)
{'duration': 5.23}
def extract_jitter( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float) -> Dict[str, float]:
def extract_jitter(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
    """Returns the jitter descriptors for the given sound or audio file.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path),
        or an `Audio` object containing the audio waveform and its corresponding sampling rate.
        floor (float): Minimum fundamental frequency (F0) in Hz.
        ceiling (float): Maximum fundamental frequency (F0) in Hz.

    Returns:
        Dict[str, float]: A dictionary containing various jitter measurements
        (local, local-absolute, rap, ppq5, ddp). All values are NaN on failure.
    """

    def _to_point_process(sound: parselmouth.Sound, f0min: float, f0max: float) -> parselmouth.Data:
        return parselmouth.praat.call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

    def _extract_jitter(jitter_type: str, point_process: parselmouth.Data) -> float:
        # Praat arguments: full time range (0, 0), shortest period 0.0001 s,
        # longest period 0.02 s, maximum period factor 1.3 (Praat defaults).
        return parselmouth.praat.call(point_process, f"Get jitter ({jitter_type})", 0, 0, 0.0001, 0.02, 1.3)

    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    # Check if the input is a Path or Audio, and convert to Parselmouth Sound if necessary
    if not isinstance(snd, parselmouth.Sound):
        snd = get_sound(snd)

    # One table drives both the success and the NaN fallback paths,
    # keeping the output keys guaranteed to be identical in both cases.
    measures = {
        "local_jitter": "local",
        "localabsolute_jitter": "local, absolute",
        "rap_jitter": "rap",
        "ppq5_jitter": "ppq5",
        "ddp_jitter": "ddp",
    }

    try:
        # Convert the sound to a point process for jitter measurement
        point_process = _to_point_process(snd, floor, ceiling)
        return {key: _extract_jitter(jitter_type, point_process) for key, jitter_type in measures.items()}

    except Exception as e:
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {key: np.nan for key in measures}

Returns the jitter descriptors for the given sound or audio file.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path), or an Audio object containing the audio waveform and its corresponding sampling rate.
  • floor (float): Minimum fundamental frequency (F0) in Hz.
  • ceiling (float): Maximum fundamental frequency (F0) in Hz.
Returns:

Dict[str, float]: A dictionary containing various jitter measurements.

def extract_shimmer( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float) -> Dict[str, float]:
def extract_shimmer(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
    """Returns the shimmer descriptors for the given sound or audio file.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path),
        or an `Audio` object containing the audio waveform and its corresponding sampling rate.
        floor (float): Minimum fundamental frequency (F0) in Hz.
        ceiling (float): Maximum fundamental frequency (F0) in Hz.

    Returns:
        Dict[str, float]: A dictionary containing various shimmer measurements
        (local, local dB, apq3, apq5, apq11, dda). All values are NaN on failure.
    """

    def _to_point_process(sound: parselmouth.Sound, f0min: float, f0max: float) -> parselmouth.Data:
        return parselmouth.praat.call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

    def _extract_shimmer(shimmer_type: str, sound: parselmouth.Sound, point_process: parselmouth.Data) -> float:
        # Praat arguments: full time range (0, 0), shortest period 0.0001 s,
        # longest period 0.02 s, maximum period factor 1.3,
        # maximum amplitude factor 1.6 (Praat defaults).
        return parselmouth.praat.call([sound, point_process], f"Get shimmer ({shimmer_type})", 0, 0, 0.0001, 0.02, 1.3, 1.6)

    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install 'senselab[audio]'`."
        )

    # Check if the input is a Path or Audio, and convert to Parselmouth Sound if necessary
    if not isinstance(snd, parselmouth.Sound):
        snd = get_sound(snd)

    # One table drives both the success and the NaN fallback paths,
    # keeping the output keys guaranteed to be identical in both cases.
    measures = {
        "local_shimmer": "local",
        "localDB_shimmer": "local_dB",
        "apq3_shimmer": "apq3",
        "apq5_shimmer": "apq5",
        "apq11_shimmer": "apq11",
        "dda_shimmer": "dda",
    }

    try:
        # Convert the sound to a point process for shimmer measurement
        point_process = _to_point_process(snd, floor, ceiling)
        return {key: _extract_shimmer(shimmer_type, snd, point_process) for key, shimmer_type in measures.items()}

    except Exception as e:
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {key: np.nan for key in measures}

Returns the shimmer descriptors for the given sound or audio file.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path), or an Audio object containing the audio waveform and its corresponding sampling rate.
  • floor (float): Minimum fundamental frequency (F0) in Hz.
  • ceiling (float): Maximum fundamental frequency (F0) in Hz.
Returns:

Dict[str, float]: A dictionary containing various shimmer measurements.

def extract_praat_parselmouth_features_from_audios( audios: List[senselab.audio.data_structures.audio.Audio], time_step: float = 0.005, window_length: float = 0.025, pitch_unit: str = 'Hertz', cache_dir: Union[str, os.PathLike, NoneType] = None, speech_rate: bool = True, intensity_descriptors: bool = True, harmonicity_descriptors: bool = True, formants: bool = True, spectral_moments: bool = True, pitch: bool = True, slope_tilt: bool = True, cpp_descriptors: bool = True, duration: bool = True, jitter: bool = True, shimmer: bool = True, plugin: str = 'serial', plugin_args: Dict[str, Any] = {}) -> List[Dict[str, Any]]:
def extract_praat_parselmouth_features_from_audios(
    audios: List[Audio],
    time_step: float = 0.005,
    window_length: float = 0.025,
    pitch_unit: str = "Hertz",
    cache_dir: Optional[str | os.PathLike] = None,
    speech_rate: bool = True,
    intensity_descriptors: bool = True,
    harmonicity_descriptors: bool = True,
    formants: bool = True,
    spectral_moments: bool = True,
    pitch: bool = True,
    slope_tilt: bool = True,
    cpp_descriptors: bool = True,
    duration: bool = True,
    jitter: bool = True,
    shimmer: bool = True,
    plugin: str = "serial",
    plugin_args: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Extract features from a list of Audio objects and return a JSON-like dictionary.

    Args:
        audios (list): List of Audio objects to extract features from.
        pitch_unit (str): Unit for pitch measurements. Defaults to "Hertz".
        time_step (float): Time rate at which to extract features. Defaults to 0.005.
        window_length (float): Window length in seconds for spectral features. Defaults to 0.025.
        cache_dir (Optional[str]): Directory to use for caching by pydra. Defaults to None.
        speech_rate (bool): Whether to extract speech rate. Defaults to True.
        intensity_descriptors (bool): Whether to extract intensity descriptors. Defaults to True.
        harmonicity_descriptors (bool): Whether to extract harmonic descriptors. Defaults to True.
        formants (bool): Whether to extract formants. Defaults to True.
        spectral_moments (bool): Whether to extract spectral moments. Defaults to True.
        pitch (bool): Whether to extract pitch. Defaults to True.
        slope_tilt (bool): Whether to extract slope and tilt. Defaults to True.
        cpp_descriptors (bool): Whether to extract CPP descriptors. Defaults to True.
        duration (bool): Whether to extract duration. Defaults to True.
        jitter (bool): Whether to extract jitter. Defaults to True.
        shimmer (bool): Whether to extract shimmer. Defaults to True.
        plugin (str): Plugin to use for feature extraction. Defaults to "serial".
        plugin_args (Optional[Dict[str, Any]]): Arguments for the pydra plugin. Defaults to None (no extra args).

    Returns:
        list: One dictionary of extracted features per input audio.
    """
    # Avoid the mutable-default-argument pitfall: a shared `{}` default would be
    # the same object across all calls of this function.
    if plugin_args is None:
        plugin_args = {}

    # Mark tasks with Pydra
    extract_pitch_values_pt = pydra.mark.task(extract_pitch_values)

    # Small adapters so the floor/ceiling scalars can be wired as workflow inputs.
    def _extract_pitch_floor(pitch_values_out: dict) -> float:
        return pitch_values_out["pitch_floor"]

    _extract_pitch_floor_pt = pydra.mark.task(_extract_pitch_floor)

    def _extract_pitch_ceiling(pitch_values_out: dict) -> float:
        return pitch_values_out["pitch_ceiling"]

    _extract_pitch_ceiling_pt = pydra.mark.task(_extract_pitch_ceiling)
    if speech_rate:
        extract_speech_rate_pt = pydra.mark.task(extract_speech_rate)
    if intensity_descriptors:
        extract_intensity_descriptors_pt = pydra.mark.task(extract_intensity_descriptors)
    if harmonicity_descriptors:
        extract_harmonicity_descriptors_pt = pydra.mark.task(extract_harmonicity_descriptors)
    if formants:
        measure_f1f2_formants_bandwidths_pt = pydra.mark.task(measure_f1f2_formants_bandwidths)
    if spectral_moments:
        extract_spectral_moments_pt = pydra.mark.task(extract_spectral_moments)
    if pitch:
        extract_pitch_descriptors_pt = pydra.mark.task(extract_pitch_descriptors)
    if slope_tilt:
        extract_slope_tilt_pt = pydra.mark.task(extract_slope_tilt)
    if cpp_descriptors:
        extract_cpp_descriptors_pt = pydra.mark.task(extract_cpp_descriptors)
    if duration:
        extract_audio_duration_pt = pydra.mark.task(extract_audio_duration)
    if jitter:
        extract_jitter_pt = pydra.mark.task(extract_jitter)
    if shimmer:
        extract_shimmer_pt = pydra.mark.task(extract_shimmer)

    # Create the workflow; it is split over the input audios so each audio is
    # processed independently.
    wf = pydra.Workflow(name="wf", input_spec=["x"], cache_dir=cache_dir)
    wf.split("x", x=audios)
    wf.add(extract_pitch_values_pt(name="extract_pitch_values_pt", snd=wf.lzin.x))
    wf.add(
        _extract_pitch_floor_pt(name="_extract_pitch_floor_pt", pitch_values_out=wf.extract_pitch_values_pt.lzout.out)
    )
    wf.add(
        _extract_pitch_ceiling_pt(
            name="_extract_pitch_ceiling_pt", pitch_values_out=wf.extract_pitch_values_pt.lzout.out
        )
    )
    if speech_rate:
        wf.add(extract_speech_rate_pt(name="extract_speech_rate_pt", snd=wf.lzin.x))
    if pitch:
        wf.add(
            extract_pitch_descriptors_pt(
                name="extract_pitch_descriptors_pt",
                snd=wf.lzin.x,
                floor=wf._extract_pitch_floor_pt.lzout.out,
                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
                frame_shift=time_step,
                unit=pitch_unit,
            )
        )
    if intensity_descriptors:
        wf.add(
            extract_intensity_descriptors_pt(
                name="extract_intensity_descriptors_pt",
                snd=wf.lzin.x,
                floor=wf._extract_pitch_floor_pt.lzout.out,
                frame_shift=time_step,
            )
        )
    if harmonicity_descriptors:
        wf.add(
            extract_harmonicity_descriptors_pt(
                name="extract_harmonicity_descriptors_pt",
                snd=wf.lzin.x,
                floor=wf._extract_pitch_floor_pt.lzout.out,
                frame_shift=time_step,
            )
        )
    if formants:
        wf.add(
            measure_f1f2_formants_bandwidths_pt(
                name="measure_f1f2_formants_bandwidths_pt",
                snd=wf.lzin.x,
                floor=wf._extract_pitch_floor_pt.lzout.out,
                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
                frame_shift=time_step,
            )
        )
    if spectral_moments:
        wf.add(
            extract_spectral_moments_pt(
                name="extract_spectral_moments_pt",
                snd=wf.lzin.x,
                floor=wf._extract_pitch_floor_pt.lzout.out,
                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
                window_size=window_length,
                frame_shift=time_step,
            )
        )
    if slope_tilt:
        wf.add(
            extract_slope_tilt_pt(
                name="extract_slope_tilt_pt",
                snd=wf.lzin.x,
                floor=wf._extract_pitch_floor_pt.lzout.out,
                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
            )
        )
    if cpp_descriptors:
        wf.add(
            extract_cpp_descriptors_pt(
                name="extract_cpp_descriptors_pt",
                snd=wf.lzin.x,
                floor=wf._extract_pitch_floor_pt.lzout.out,
                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
                frame_shift=time_step,
            )
        )
    if duration:
        wf.add(extract_audio_duration_pt(name="extract_audio_duration_pt", snd=wf.lzin.x))
    if jitter:
        wf.add(
            extract_jitter_pt(
                name="extract_jitter_pt",
                snd=wf.lzin.x,
                floor=wf._extract_pitch_floor_pt.lzout.out,
                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
            )
        )
    if shimmer:
        wf.add(
            extract_shimmer_pt(
                name="extract_shimmer_pt",
                snd=wf.lzin.x,
                floor=wf._extract_pitch_floor_pt.lzout.out,
                ceiling=wf._extract_pitch_ceiling_pt.lzout.out,
            )
        )

    # setting multiple workflow outputs
    output_connections = [("pitch_values_out", wf.extract_pitch_values_pt.lzout.out)]
    if speech_rate:
        output_connections.append(("speech_rate_out", wf.extract_speech_rate_pt.lzout.out))
    if pitch:
        output_connections.append(("pitch_out", wf.extract_pitch_descriptors_pt.lzout.out))
    if intensity_descriptors:
        output_connections.append(("intensity_out", wf.extract_intensity_descriptors_pt.lzout.out))
    if harmonicity_descriptors:
        output_connections.append(("harmonicity_out", wf.extract_harmonicity_descriptors_pt.lzout.out))
    if formants:
        output_connections.append(("formants_out", wf.measure_f1f2_formants_bandwidths_pt.lzout.out))
    if spectral_moments:
        output_connections.append(("spectral_moments_out", wf.extract_spectral_moments_pt.lzout.out))
    if slope_tilt:
        output_connections.append(("slope_tilt_out", wf.extract_slope_tilt_pt.lzout.out))
    if cpp_descriptors:
        output_connections.append(("cpp_out", wf.extract_cpp_descriptors_pt.lzout.out))
    if duration:
        output_connections.append(("audio_duration", wf.extract_audio_duration_pt.lzout.out))
    if jitter:
        output_connections.append(("jitter_out", wf.extract_jitter_pt.lzout.out))
    if shimmer:
        output_connections.append(("shimmer_out", wf.extract_shimmer_pt.lzout.out))
    wf.set_output(output_connections)

    with pydra.Submitter(plugin=plugin, **plugin_args) as sub:
        sub(wf)

    outputs = wf.result()

    extracted_data = []

    for output in outputs:
        feature_data: Dict[str, Any] = {}
        # Audio duration
        if duration:
            feature_data["duration"] = output.output.audio_duration["duration"]
        # Timing and Pausing
        if speech_rate:
            feature_data["speaking_rate"] = output.output.speech_rate_out["speaking_rate"]
            feature_data["articulation_rate"] = output.output.speech_rate_out["articulation_rate"]
            feature_data["phonation_ratio"] = output.output.speech_rate_out["phonation_ratio"]
            feature_data["pause_rate"] = output.output.speech_rate_out["pause_rate"]
            feature_data["mean_pause_duration"] = output.output.speech_rate_out["mean_pause_dur"]
        # Pitch:
        if pitch:
            feature_data[f"mean_f0_{pitch_unit.lower()}"] = output.output.pitch_out[f"mean_f0_{pitch_unit.lower()}"]
            feature_data[f"std_f0_{pitch_unit.lower()}"] = output.output.pitch_out[f"stdev_f0_{pitch_unit.lower()}"]
        # Intensity (BUGFIX: previously read under `if pitch:`, which raised an
        # AttributeError when pitch=True but intensity_descriptors=False):
        if intensity_descriptors:
            feature_data["mean_intensity_db"] = output.output.intensity_out["mean_db"]
            feature_data["std_intensity_db"] = output.output.intensity_out["std_db"]
            feature_data["range_ratio_intensity_db"] = output.output.intensity_out["range_db_ratio"]
        # Quality Features:
        if harmonicity_descriptors:
            feature_data["mean_hnr_db"] = output.output.harmonicity_out["hnr_db_mean"]
            feature_data["std_hnr_db"] = output.output.harmonicity_out["hnr_db_std_dev"]
        # Slope/tilt and CPP are produced by their own tasks, so they are gated
        # on their own flags (BUGFIX: previously under `if harmonicity_descriptors:`):
        if slope_tilt:
            feature_data["spectral_slope"] = output.output.slope_tilt_out["spectral_slope"]
            feature_data["spectral_tilt"] = output.output.slope_tilt_out["spectral_tilt"]
        if cpp_descriptors:
            feature_data["cepstral_peak_prominence_mean"] = output.output.cpp_out["mean_cpp"]
            feature_data["cepstral_peak_prominence_std"] = output.output.cpp_out["std_dev_cpp"]
        # Formant (F1, F2):
        if formants:
            feature_data["mean_f1_loc"] = output.output.formants_out["f1_mean"]
            feature_data["std_f1_loc"] = output.output.formants_out["f1_std"]
            feature_data["mean_b1_loc"] = output.output.formants_out["b1_mean"]
            feature_data["std_b1_loc"] = output.output.formants_out["b1_std"]
            feature_data["mean_f2_loc"] = output.output.formants_out["f2_mean"]
            feature_data["std_f2_loc"] = output.output.formants_out["f2_std"]
            feature_data["mean_b2_loc"] = output.output.formants_out["b2_mean"]
            feature_data["std_b2_loc"] = output.output.formants_out["b2_std"]
        # Spectral Moments:
        if spectral_moments:
            feature_data["spectral_gravity"] = output.output.spectral_moments_out["spectral_gravity"]
            feature_data["spectral_std_dev"] = output.output.spectral_moments_out["spectral_std_dev"]
            feature_data["spectral_skewness"] = output.output.spectral_moments_out["spectral_skewness"]
            feature_data["spectral_kurtosis"] = output.output.spectral_moments_out["spectral_kurtosis"]
        # Jitter Descriptors:
        if jitter:
            feature_data["local_jitter"] = output.output.jitter_out["local_jitter"]
            feature_data["localabsolute_jitter"] = output.output.jitter_out["localabsolute_jitter"]
            feature_data["rap_jitter"] = output.output.jitter_out["rap_jitter"]
            feature_data["ppq5_jitter"] = output.output.jitter_out["ppq5_jitter"]
            feature_data["ddp_jitter"] = output.output.jitter_out["ddp_jitter"]
        # Shimmer Descriptors:
        if shimmer:
            feature_data["local_shimmer"] = output.output.shimmer_out["local_shimmer"]
            feature_data["localDB_shimmer"] = output.output.shimmer_out["localDB_shimmer"]
            feature_data["apq3_shimmer"] = output.output.shimmer_out["apq3_shimmer"]
            feature_data["apq5_shimmer"] = output.output.shimmer_out["apq5_shimmer"]
            feature_data["apq11_shimmer"] = output.output.shimmer_out["apq11_shimmer"]
            feature_data["dda_shimmer"] = output.output.shimmer_out["dda_shimmer"]

        extracted_data.append(feature_data)

    return extracted_data

Extract features from a list of Audio objects and return a JSON-like dictionary.

Arguments:
  • audios (list): List of Audio objects to extract features from.
  • pitch_unit (str): Unit for pitch measurements. Defaults to "Hertz".
  • time_step (float): Time step, in seconds, at which to extract features. Defaults to 0.005.
  • window_length (float): Window length in seconds for spectral features. Defaults to 0.025.
  • cache_dir (Optional[str]): Directory to use for caching by pydra. Defaults to None.
  • speech_rate (bool): Whether to extract speech rate. Defaults to True.
  • intensity_descriptors (bool): Whether to extract intensity descriptors. Defaults to True.
  • harmonicity_descriptors (bool): Whether to extract harmonicity descriptors. Defaults to True.
  • formants (bool): Whether to extract formants. Defaults to True.
  • spectral_moments (bool): Whether to extract spectral moments. Defaults to True.
  • pitch (bool): Whether to extract pitch. Defaults to True.
  • slope_tilt (bool): Whether to extract slope and tilt. Defaults to True.
  • cpp_descriptors (bool): Whether to extract CPP descriptors. Defaults to True.
  • duration (bool): Whether to extract duration. Defaults to True.
  • jitter (bool): Whether to extract jitter. Defaults to True.
  • shimmer (bool): Whether to extract shimmer. Defaults to True.
  • plugin (str): Plugin to use for feature extraction. Defaults to "serial".
  • plugin_args (Optional[Dict[str, Any]]): Arguments for the pydra plugin. Defaults to {}.
Returns:
  dict: A JSON-like dictionary with extracted features structured under "praat_parselmouth".