senselab.audio.tasks.features_extraction.praat_parselmouth

This module contains functions that extract features from audio files using the PRAAT library.

The initial implementation of this feature extraction was started by Nicholas Cummins from King's College London and has since been further developed and maintained by the senselab community.

   1"""This module contains functions that extract features from audio files using the PRAAT library.
   2
   3The initial implementation of this features extraction was started by Nicholas Cummins
   4from King's College London and has since been further developed and maintained
   5by the senselab community.
   6"""
   7
   8import inspect
   9import os
  10from pathlib import Path
  11from typing import Any, Dict, List, Optional, Sequence, Union
  12
  13import numpy as np
  14
  15from senselab.audio.data_structures import Audio
  16from senselab.utils.data_structures import logger
  17
  18try:
  19    import parselmouth  # type: ignore
  20
  21    PARSELMOUTH_AVAILABLE = True
  22except ModuleNotFoundError:
  23    PARSELMOUTH_AVAILABLE = False
  24
  25    class DummyParselmouth:
  26        """Dummy class for when parselmouth is not available.
  27
  28        This is helpful for type checking when parselmouth is not installed.
  29        """
  30
  31        def __init__(self) -> None:
  32            """Dummy constructor for when parselmouth is not available."""
  33            pass
  34
  35        def call(self, *args: object, **kwargs: object) -> None:  # type: ignore
  36            """Dummy method for when parselmouth is not available."""
  37
  38        class Sound:
  39            """Dummy class for when parselmouth is not available."""
  40
  41            def __init__(self, *args: object, **kwargs: object) -> None:
  42                """Dummy class for when parselmouth is not available."""
  43                pass
  44
  45    parselmouth = DummyParselmouth()
  46
  47
  48def get_sound(audio: Union[Path, Audio], sampling_rate: int = 16000) -> parselmouth.Sound:
  49    """Get a sound object from a given audio file or Audio object.
  50
  51    Args:
  52        audio (Union[Path, Audio]): A path to an audio file or an Audio object.
  53        sampling_rate (int, optional): The sampling rate of the audio. Defaults to 16000.
  54
  55    Returns:
  56        parselmouth.Sound: A Parselmouth Sound object.
  57
  58    Raises:
  59        FileNotFoundError: If the file is not found at the given path.
  60    """
  61    if not PARSELMOUTH_AVAILABLE:
  62        raise ModuleNotFoundError(
  63            "`parselmouth` is not installed. "
  64            "Please install senselab audio dependencies using `pip install senselab`."
  65        )
  66
  67    try:
  68        # Loading the sound
  69        if isinstance(audio, Path):
  70            audio = audio.resolve()
  71            if not audio.exists():
  72                logger.error(f"File does not exist: {audio}")
  73                raise FileNotFoundError(f"File does not exist: {audio}")
  74            snd_full = parselmouth.Sound(str(audio))
  75        elif isinstance(audio, Audio):
  76            snd_full = parselmouth.Sound(audio.waveform, audio.sampling_rate)
  77
  78        # Preprocessing
  79        if parselmouth.praat.call(snd_full, "Get number of channels") > 1:
  80            snd_full = snd_full.convert_to_mono()
  81        if parselmouth.praat.call(snd_full, "Get sampling frequency") != sampling_rate:
  82            snd_full = parselmouth.praat.call(snd_full, "Resample", sampling_rate, 50)
  83            # Details of queery: https://www.fon.hum.uva.nl/praat/manual/Get_sampling_frequency.html
  84            # Details of conversion: https://www.fon.hum.uva.nl/praat/manual/Sound__Resample___.html
  85    except Exception as e:
  86        raise RuntimeError(f"Error loading sound: {e}")
  87    return snd_full
  88
  89
def extract_speech_rate(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
    """Extract speech timing and pausing features from a given sound object.

    The function detects sounding/silent intervals from the intensity contour, finds
    intensity peaks that qualify as syllable nuclei (loud enough, preceded by a sufficient
    dip, voiced, and inside a sounding interval) and derives rate measures from them.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
            Anything that is not already a Parselmouth Sound is loaded through `get_sound`.

    Returns:
        Dict[str, float]: A dictionary containing the following features:

            - speaking_rate (float): Number of syllables divided by duration.
            - articulation_rate (float): Number of syllables divided by phonation time.
            - phonation_ratio (float): Phonation time divided by duration.
            - pause_rate (float): Number of pauses divided by duration.
            - mean_pause_dur (float): Total time pausing divided by the number of identified pauses
              (0.0 when no pause is identified).

        "Duration" is the time from the start of the first sounding interval to the end of the
        last sounding interval. If any step raises an exception, the error is logged and every
        value in the dictionary is NaN.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_speech_rate(snd)
        {
            'speaking_rate': 5.3,
            'articulation_rate': 4.7,
            'phonation_ratio': 0.9,
            'pause_rate': 2.1,
            'mean_pause_dur': 0.5
        }
        ```

    Useful sources for this code:

        - https://sites.google.com/view/uhm-o-meter/scripts/syllablenuclei_v3?pli=1
        - https://drive.google.com/file/d/1o3mNdN5FKTiYQC9GHB1XoZ8JJIGZk_AK/view
        - (2009 paper) https://doi.org/10.3758/BRM.41.2.385
        - (2021 paper) https://doi.org/10.1080/0969594X.2021.1951162
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    try:
        # _____________________________________________________________________________________________________________
        # Load the input into parselmouth if it is not already a Parselmouth Sound (i.e. a Path or an Audio object)
        if not isinstance(snd, parselmouth.Sound):
            snd = get_sound(snd)

        # _____________________________________________________________________________________________________________
        # Key pause detection hyperparameters

        # Silence threshold (dB) - standard setting to detect silence in the "To TextGrid (silences)" function.
        # The higher this number, the lower the chances of finding silent pauses
        silence_db = -25

        # Minimum dip between peaks (dB) - if there are decreases in intensity
        # of at least this value surrounding the peak, the peak is labelled to be a syllable nucleus
        # I.e. the size of the dip between two possible peaks
        # The higher this number, the fewer syllables will be found
        # For clean and filtered signal use 4, if not use 2 (recommended thresholds)
        min_dip = 4
        # Code for determining if the signal is not clean/filtered: the mean harmonicity (HNR) of the
        # whole file is used as the cleanliness measure.
        hnr = parselmouth.praat.call(
            snd.to_harmonicity_cc(), "Get mean", 0, 0
        )  # Note: (0, 0) is the time range for extraction, setting both to zero tells Praat to use the full file
        # NOTE(review): a mean HNR of 60 is a very high bar, so min_dip = 2 will be selected for most
        # recordings - confirm that this threshold is intended.
        if hnr < 60:
            min_dip = 2

        # Minimum pause duration (s): How long should a pause be to be counted as a silent pause?
        # The higher this number, the fewer pauses will be found
        min_pause = 0.3  # the default for this is 0.1 in Praat, de Jong's script has this set at 0.3
        # Based on values in: Toward an understanding of fluency:
        # A microanalysis of nonnative speaker conversations (Riggenbach)
        # - Micropause (silence of .2s or less)
        # - Hesitation (silence of .3 to .4s)
        # - Unfilled pause (silence of .5s or more)

        # ______________________________________________________________________________________________________________
        # Intensity information

        intensity = snd.to_intensity(minimum_pitch=50, time_step=0.016, subtract_mean=True)
        # These are the settings recommended by de Jong - "minimum pitch" set to 50 Hz.
        # With this parameter setting, we extract intensity smoothed over a time window of (3.2/minimum_pitch)=64 msec,
        # with 16-msec time steps. An explanation of these calculations is found at:
        # https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html

        min_intensity = parselmouth.praat.call(intensity, "Get minimum", 0, 0, "Parabolic")  # time range, Interpolation
        max_intensity = parselmouth.praat.call(intensity, "Get maximum", 0, 0, "Parabolic")  # time range, Interpolation

        # Silence is detected by measuring whether the intensity is 25 dB below the 99% highest peak
        # 99% is chosen to eliminate short loud bursts in intensity that may not have been speech

        # get .99 quantile to get maximum (without influence of non-speech sound bursts)
        max_99_intensity = parselmouth.praat.call(intensity, "Get quantile", 0, 0, 0.99)

        # Estimate the intensity thresholds:
        # - silence_db_1: absolute level (99% quantile + silence_db), clamped so it is never below the
        #   minimum intensity; used further down to discard peaks that are too quiet.
        # - silence_db_2: silence_db shifted by the gap between the absolute maximum and the 99% quantile;
        #   passed to "To TextGrid (silences)" below.
        silence_db_1 = max_99_intensity + silence_db
        db_adjustment = max_intensity - max_99_intensity
        silence_db_2 = silence_db - db_adjustment
        if silence_db_1 < min_intensity:
            silence_db_1 = min_intensity

        # ______________________________________________________________________________________________________________
        # Create a TextGrid holding the silent and sounding intervals, and store these intervals

        textgrid = parselmouth.praat.call(
            intensity, "To TextGrid (silences)", silence_db_2, min_pause, 0.1, "silent", "sounding"
        )
        # Hyperparameters:
        # Silence threshold (dB),
        # Minimum silent interval (s) - minimum duration for an interval to be considered as silent
        # Minimum sounding interval (s) - minimum duration for an interval to be not considered as silent
        # Silent interval label
        # Sounding interval label

        # Loop through intervals and extract times of identified sounding sections.
        # The table is built from the intervals labelled "sounding", so despite its name `npauses` is the
        # number of sounding intervals; the number of pauses between them is derived later as npauses - 1.
        silencetier = parselmouth.praat.call(textgrid, "Extract tier", 1)
        silencetable = parselmouth.praat.call(silencetier, "Down to TableOfReal", "sounding")
        npauses = parselmouth.praat.call(silencetable, "Get number of rows")

        phonation_time = 0
        for ipause in range(npauses):
            pause = ipause + 1  # Praat rows are 1-indexed
            beginsound = parselmouth.praat.call(silencetable, "Get value", pause, 1)
            endsound = parselmouth.praat.call(silencetable, "Get value", pause, 2)
            speakingdur = endsound - beginsound

            phonation_time += speakingdur

            # This is to remove the first (before first word) and last (after last word) silence from consideration.
            # If there are no sounding intervals these names stay unbound; the resulting error is handled by
            # the `except` block below, which returns NaN values.
            if pause == 1:
                begin_speak = beginsound
            if pause == (npauses):
                end_speak = endsound

        # ______________________________________________________________________________________________________________
        # Next block of code finds all possible peaks

        # Convert the intensity contour into a matrix
        intensity_matrix = parselmouth.praat.call(intensity, "Down to Matrix")  # convert intensity to 2d representation

        # Convert the intensity matrix into a sound representation
        sound_from_intensity_matrix = parselmouth.praat.call(intensity_matrix, "To Sound (slice)", 1)

        # find positive extrema, maxima in sound_from_intensity_matrix, which correspond to steepest rises in Intensity;
        point_process = parselmouth.praat.call(
            sound_from_intensity_matrix,
            "To PointProcess (extrema)",
            "Left",
            "yes",
            "no",
            "Sinc70",
        )

        # estimate peak positions (all peaks)
        t = []
        numpeaks = parselmouth.praat.call(point_process, "Get number of points")
        for i in range(numpeaks):
            t.append(parselmouth.praat.call(point_process, "Get time from index", i + 1))

        # ______________________________________________________________________________________________________________
        # Find the time and values of all peaks

        # fill arrays with the times and intensity values of peaks that are louder than silence_db_1
        timepeaks = []
        peakcount = 0
        intensities = []
        for i in range(numpeaks):
            value = parselmouth.praat.call(sound_from_intensity_matrix, "Get value at time", t[i], "Cubic")
            if value > silence_db_1:
                peakcount += 1
                intensities.append(value)
                timepeaks.append(t[i])

        # ______________________________________________________________________________________________________________
        # Now find all valid peaks

        # fill array with valid peaks: a peak is kept only if the dip in intensity between it and the
        # following peak is greater than min_dip.
        # If no peak passed the loudness check, indexing [0] raises and the `except` block returns NaN values.
        validpeakcount = 0
        currenttime = timepeaks[0]
        currentint = intensities[0]
        validtime = []

        # The loop compares each peak with its successor, so the last peak is never added to validtime.
        for p in range(peakcount - 1):
            following = p + 1
            followingtime = timepeaks[following]
            dip = parselmouth.praat.call(
                intensity, "Get minimum", currenttime, followingtime, "None"
            )  # Gets the minimum value between two time points, doesn't interpolate/filter
            diffint = abs(currentint - dip)
            if diffint > min_dip:
                validpeakcount += 1
                validtime.append(timepeaks[p])
            # Update current time and intensity values for next loop
            currenttime = timepeaks[following]
            currentint = parselmouth.praat.call(intensity, "Get value at time", timepeaks[following], "Cubic")

        # ______________________________________________________________________________________________________________
        # Extract voicing information

        pitch = snd.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
        # Praat page for hyperparameters https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html
        # From de Jong's 2009 paper - We extract the pitch contour, this time using a window size of 100 msec
        # and 20-msec time steps, and exclude all peaks that are unvoiced
        # Key hyperparameters are different to the Praat recommended ones - can't find a reason for this.
        # Parselmouth signature (with its defaults) for reference; the positional values above are passed
        # in this order:
        # time_step: Optional[Positive[float]] = None,  - set per de Jong's recommendation
        # pitch_floor: Positive[float] = 75.0 set per de Jong's recommendation - 3/30 gives 100ms
        # max_number_of_candidates: Positive[int] = 15 (can't find a reason for this value being lower)
        # very_accurate: bool = False,
        # silence_threshold: float = 0.03,
        # voicing_threshold: float = 0.45, (can't find a reason for this value being different)
        # octave_cost: float = 0.01,
        # octave_jump_cost: float = 0.35,
        # voiced_unvoiced_cost: float = 0.14, (can't find a reason for this value being different)
        # pitch_ceiling: Positive[float] = 600.0 (can't find a reason for this value being lower, might change to value
        # from pitch_value function)

        # ______________________________________________________________________________________________________________
        # Loop through valid peaks, count ones that are voiced (i.e., have valid pitch value at the same time)
        # and that fall inside an interval labelled "sounding"

        number_syllables = int(0)
        for time in range(validpeakcount):
            querytime = validtime[time]
            whichinterval = parselmouth.praat.call(textgrid, "Get interval at time", 1, querytime)
            whichlabel = parselmouth.praat.call(textgrid, "Get label of interval", 1, whichinterval)
            value = pitch.get_value_at_time(querytime)
            if not np.isnan(value):
                if whichlabel == "sounding":
                    number_syllables += 1

        # ______________________________________________________________________________________________________________
        # return results

        # Duration from the start of the first sounding interval to the end of the last one
        original_dur = end_speak - begin_speak

        # A zero duration or zero phonation time raises here and is handled by the `except` block below
        speaking_rate = number_syllables / original_dur
        articulation_rate = number_syllables / phonation_time
        phonation_ratio = phonation_time / original_dur

        # Pauses are the gaps between consecutive sounding intervals
        number_pauses = npauses - 1
        pause_time = original_dur - phonation_time

        pause_rate = number_pauses / original_dur
        mean_pause_dur = pause_time / number_pauses if number_pauses > 0 else 0.0

        return {
            "speaking_rate": speaking_rate,
            "articulation_rate": articulation_rate,
            "phonation_ratio": phonation_ratio,
            "pause_rate": pause_rate,
            "mean_pause_dur": mean_pause_dur,
        }

    except Exception as e:
        # Best-effort behaviour: log the failure under this function's name and return NaN for every feature
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {
            "speaking_rate": np.nan,
            "articulation_rate": np.nan,
            "phonation_ratio": np.nan,
            "pause_rate": np.nan,
            "mean_pause_dur": np.nan,
        }
 355
 356
 357def extract_pitch_values(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
 358    """Estimate Pitch Range.
 359
 360    Calculates the mean pitch using a wide range and uses this to shorten the range for future pitch extraction
 361    algorithms.
 362
 363    Args:
 364        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 365
 366    Returns:
 367        dict: A dictionary containing the following keys:
 368
 369            - pitch_floor (float): The lowest pitch value to use in future pitch extraction algorithms.
 370            - pitch_ceiling (float): The highest pitch value to use in future pitch extraction algorithms.
 371
 372    Notes:
 373        Values are taken from: [Standardization of pitch-range settings in voice acoustic analysis](https://doi.org/10.3758/BRM.41.2.318)
 374
 375        The problem observed with doing a really broad pitch search was the occasional error if F1 was low.
 376        So crude outlier detection is used to help with this.
 377
 378        Important: These values are used within other functions, they are not outputs of the full code.
 379
 380        Different pitch extraction methods in Praat:
 381
 382        - Cross-correlation (Praat default) vs auto-correlation pitch extraction:
 383        both are used in different functions below.
 384        - Cross-correlation is better than auto-correlation at finding period-level variation,
 385        such as jitter and shimmer, whereas auto-correlation is better at finding intended intonation contours.
 386        - [Discussion on this on a Praat Forum](https://groups.io/g/Praat-Users-List/topic/pitch_detection_ac_vs_cc/78829266?p=,,,20,0,0,0::recentpostdate/sticky,,,20,2,20,78829266,previd=1612369050729515119,nextid=1605568402827788039&previd=1612369050729515119&nextid=1605568402827788039)
 387
 388    Examples:
 389        ```python
 390        >>> snd = parselmouth.Sound("path_to_audio.wav")
 391        >>> pitch_values(snd)
 392        {'pitch_floor': 60, 'pitch_ceiling': 250}
 393        ```
 394    """
 395    if not PARSELMOUTH_AVAILABLE:
 396        raise ModuleNotFoundError(
 397            "`parselmouth` is not installed. "
 398            "Please install senselab audio dependencies using `pip install senselab`."
 399        )
 400
 401    try:
 402        if not isinstance(snd, parselmouth.Sound):
 403            snd = get_sound(snd)
 404
 405        pitch_wide = snd.to_pitch_ac(time_step=0.005, pitch_floor=50, pitch_ceiling=600)
 406        # Other than values above, I'm using default hyperparamters
 407        # Details: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html
 408
 409        # remove outliers from wide pitch search
 410        pitch_values = pitch_wide.selected_array["frequency"]
 411        pitch_values = pitch_values[pitch_values != 0]
 412        pitch_values_Z = (pitch_values - np.mean(pitch_values)) / np.std(pitch_values)
 413        pitch_values_filtered = pitch_values[abs(pitch_values_Z) <= 2]
 414
 415        mean_pitch = np.mean(pitch_values_filtered)
 416
 417        # Here there is an interesting alternative solution to discuss: https://praatscripting.lingphon.net/conditionals-1.html
 418        if mean_pitch < 170:
 419            # 'male' settings
 420            pitch_floor = 60.0
 421            pitch_ceiling = 250.0
 422        else:
 423            # 'female' and 'child' settings
 424            pitch_floor = 100.0
 425            pitch_ceiling = 500.0
 426
 427        return {"pitch_floor": pitch_floor, "pitch_ceiling": pitch_ceiling}
 428    except Exception as e:
 429        current_frame = inspect.currentframe()
 430        if current_frame is not None:
 431            current_function_name = current_frame.f_code.co_name
 432            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 433        return {"pitch_floor": np.nan, "pitch_ceiling": np.nan}
 434
 435
 436def extract_pitch_descriptors(
 437    snd: Union[parselmouth.Sound, Path, Audio],
 438    floor: float,
 439    ceiling: float,
 440    frame_shift: float = 0.005,
 441    unit: str = "Hertz",
 442) -> Dict[str, float]:
 443    """Extract Pitch Features.
 444
 445    Function to extract key pitch features from a given sound object.
 446    This function uses the pitch_ac method as autocorrelation is better at finding intended intonation contours.
 447
 448    Args:
 449        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 450        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 451        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 452        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 453            Defaults to 0.005.
 454        unit (str, optional): The unit in which the pitch is returned. Defaults to "Hertz".
 455            Could be "semitones".
 456
 457    Returns:
 458        dict: A dictionary containing the following keys:
 459
 460            - mean_f0_{unit} (float): Mean pitch in {unit}.
 461            - stdev_f0_{unit} (float): Standard deviation in {unit}.
 462
 463    Notes:
 464        - Uses pitch_ac as autocorrelation is better at finding intended intonation contours.
 465        - stdev_f0_semitone is used in DOI: 10.1080/02699200400008353, which used this as a marker for dysphonia.
 466
 467    Examples:
 468        ```python
 469        >>> snd = parselmouth.Sound("path_to_audio.wav")
 470        >>> extract_pitch_descriptors(snd, 75, 500, 0.01, "Hertz")
 471        {'mean_f0_hertz': 220.5, 'stdev_f0_hertz': 2.5}
 472        ```
 473    """
 474    if not PARSELMOUTH_AVAILABLE:
 475        raise ModuleNotFoundError(
 476            "`parselmouth` is not installed. "
 477            "Please install senselab audio dependencies using `pip install senselab`."
 478        )
 479
 480    try:
 481        if not isinstance(snd, parselmouth.Sound):
 482            snd = get_sound(snd)
 483
 484        # Extract pitch object
 485        pitch = snd.to_pitch_ac(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)
 486        # Other than values above, I'm using default hyperparameters
 487        # Details: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html
 488
 489        # Extract mean, median, and standard deviation
 490        mean_f0 = parselmouth.praat.call(pitch, "Get mean", 0, 0, unit)  # time range, units
 491        stdev_f0 = parselmouth.praat.call(pitch, "Get standard deviation", 0, 0, unit)
 492
 493        # Return results
 494        return {f"mean_f0_{unit.lower()}": mean_f0, f"stdev_f0_{unit.lower()}": stdev_f0}
 495    except Exception as e:
 496        current_frame = inspect.currentframe()
 497        if current_frame is not None:
 498            current_function_name = current_frame.f_code.co_name
 499            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 500        return {f"mean_f0_{unit.lower()}": np.nan, f"stdev_f0_{unit.lower()}": np.nan}
 501
 502
 503def extract_intensity_descriptors(
 504    snd: Union[parselmouth.Sound, Path, Audio], floor: float, frame_shift: float
 505) -> Dict[str, float]:
 506    """Extract Intensity Features.
 507
 508    Function to extract key intensity information from a given sound object.
 509    This function is based on default Praat code adapted to work with Parselmouth.
 510
 511    Args:
 512        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 513        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 514        frame_shift (float): Time rate at which to extract a new intensity value, typically set to 5 ms.
 515
 516    Returns:
 517        dict: A dictionary containing the following keys:
 518
 519            - mean_db (float): Mean intensity in dB.
 520            - std_db (float): Standard deviation in dB.
 521            - range_db_ratio (float): Intensity range, expressed as a ratio in dB.
 522
 523    Examples:
 524        ```python
 525        >>> snd = parselmouth.Sound("path_to_audio.wav")
 526        >>> extract_intensity_descriptors(snd, 75, 0.01)
 527        {'mean_db': 70.5, 'std_db': 0.5, 'range_db_ratio': 2.5}
 528        ```
 529
 530    Notes:
 531        - Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html
 532        - For notes on extracting mean settings: https://www.fon.hum.uva.nl/praat/manual/Intro_6_2__Configuring_the_intensity_contour.html
 533    """
 534    if not PARSELMOUTH_AVAILABLE:
 535        raise ModuleNotFoundError(
 536            "`parselmouth` is not installed. "
 537            "Please install senselab audio dependencies using `pip install senselab`."
 538        )
 539
 540    try:
 541        if not isinstance(snd, parselmouth.Sound):
 542            snd = get_sound(snd)
 543
 544        # Extract intensity object
 545        intensity = snd.to_intensity(minimum_pitch=floor, time_step=frame_shift, subtract_mean=True)
 546        # Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html
 547
 548        # Extract descriptors
 549        mean_db = parselmouth.praat.call(
 550            intensity, "Get mean", 0, 0, "energy"
 551        )  # get mean - time range, time range, averaging method
 552        std_db = parselmouth.praat.call(intensity, "Get standard deviation", 0, 0)
 553        min_dB = parselmouth.praat.call(intensity, "Get minimum", 0, 0, "parabolic")  # time range, Interpolation
 554        max_dB = parselmouth.praat.call(intensity, "Get maximum", 0, 0, "parabolic")  # time range, Interpolation
 555        range_db_ratio = max_dB / min_dB
 556
 557        # Return results
 558        return {"mean_db": mean_db, "std_db": std_db, "range_db_ratio": range_db_ratio}
 559
 560    except Exception as e:
 561        current_frame = inspect.currentframe()
 562        if current_frame is not None:
 563            current_function_name = current_frame.f_code.co_name
 564            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 565        return {"mean_db": np.nan, "std_db": np.nan, "range_db_ratio": np.nan}
 566
 567
 568def extract_harmonicity_descriptors(
 569    snd: Union[parselmouth.Sound, Path, Audio], floor: float, frame_shift: float
 570) -> Dict[str, float]:
 571    """Voice Quality - HNR.
 572
 573    Function to calculate the Harmonic to Noise Ratio (HNR) in dB from a given sound object.
 574    This function uses the CC method as recommended by Praat.
 575
 576    Args:
 577        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 578        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 579        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 580
 581    Returns:
 582        dict: A dictionary containing the following key:
 583
 584            - hnr_db_mean (float): Mean Harmonic to Noise Ratio in dB.
 585            - hnr_db_std_dev (float): Harmonic to Noise Ratio standard deviation in dB.
 586
 587    Examples:
 588        ```python
 589        >>> snd = parselmouth.Sound("path_to_audio.wav")
 590        >>> extract_harmonicity_descriptors(snd, 75, 0.01)
 591        {'hnr_db_mean': 15.3, 'hnr_db_std_dev': 0.5}
 592        ```
 593
 594    Notes:
 595        - Praat recommends using the CC method: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__cc____.html
 596        - Default settings can be found at: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__ac____.html
 597    """
 598    if not PARSELMOUTH_AVAILABLE:
 599        raise ModuleNotFoundError(
 600            "`parselmouth` is not installed. "
 601            "Please install senselab audio dependencies using `pip install senselab`."
 602        )
 603
 604    try:
 605        if not isinstance(snd, parselmouth.Sound):
 606            snd = get_sound(snd)
 607
 608        # Extract HNR information
 609        harmonicity = snd.to_harmonicity_cc(
 610            time_step=frame_shift, minimum_pitch=floor, silence_threshold=0.1, periods_per_window=4.5
 611        )
 612        # Praat recommends using the CC method here: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__cc____.html
 613
 614        hnr_db_mean = parselmouth.praat.call(harmonicity, "Get mean", 0, 0)
 615        hnr_db_std_dev = parselmouth.praat.call(harmonicity, "Get standard deviation", 0, 0)
 616
 617        return {"hnr_db_mean": hnr_db_mean, "hnr_db_std_dev": hnr_db_std_dev}
 618    except Exception as e:
 619        current_frame = inspect.currentframe()
 620        if current_frame is not None:
 621            current_function_name = current_frame.f_code.co_name
 622            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 623
 624        return {"hnr_db_mean": np.nan, "hnr_db_std_dev": np.nan}
 625
 626
 627def extract_slope_tilt(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
 628    """Voice Quality - Spectral Slope/Tilt.
 629
 630    Function to extract spectral slope and tilt from a given sound object. This function is based on default
 631    Praat code adapted to work with Parselmouth.
 632
 633    Args:
 634        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 635        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 636        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 637
 638    Returns:
 639        dict: A dictionary containing the following keys:
 640
 641            - spectral_slope (float): Mean spectral slope.
 642            - spectral_tilt (float): Mean spectral tilt.
 643
 644    Examples:
 645        ```python
 646        >>> snd = parselmouth.Sound("path_to_audio.wav")
 647        >>> extract_slope_tilt(snd, 75, 500)
 648        {'spectral_slope': -0.8, 'spectral_tilt': -2.5}
 649        ```
 650
 651    Notes:
 652        - Spectral Slope: Ratio of energy in a spectra between 10-1000Hz over 1000-4000Hz.
 653        - Spectral Tilt: Linear slope of energy distribution between 100-5000Hz.
 654        - Using pitch-corrected LTAS to remove the effect of F0 and harmonics on the slope calculation:
 655        https://www.fon.hum.uva.nl/paul/papers/BoersmaKovacic2006.pdf
 656    """
 657    if not PARSELMOUTH_AVAILABLE:
 658        raise ModuleNotFoundError(
 659            "`parselmouth` is not installed. "
 660            "Please install senselab audio dependencies using `pip install senselab`."
 661        )
 662
 663    try:
 664        if not isinstance(snd, parselmouth.Sound):
 665            snd = get_sound(snd)
 666
 667        ltas_rep = parselmouth.praat.call(
 668            snd, "To Ltas (pitch-corrected)...", floor, ceiling, 5000, 100, 0.0001, 0.02, 1.3
 669        )
 670        # Hyperparameters: Min Pitch (Hz), Max Pitch (Hz), Maximum Frequency (Hz), Bandwidth (Hz), Shortest Period (s),
 671        # Longest Period (s), Maximum period factor
 672
 673        spectral_slope = parselmouth.praat.call(ltas_rep, "Get slope", 50, 1000, 1000, 4000, "dB")
 674        # Hyperparameters: f1min, f1max, f2min, f2max, averagingUnits
 675
 676        spectral_tilt_Report = parselmouth.praat.call(ltas_rep, "Report spectral tilt", 100, 5000, "Linear", "Robust")
 677        # Hyperparameters: minimumFrequency, maximumFrequency, Frequency Scale (linear or logarithmic),
 678        # Fit method (least squares or robust)
 679
 680        srt_st = spectral_tilt_Report.index("Slope: ") + len("Slope: ")
 681        end_st = spectral_tilt_Report.index("d", srt_st)
 682        spectral_tilt = float(spectral_tilt_Report[srt_st:end_st])
 683
 684        # Return results
 685        return {"spectral_slope": spectral_slope, "spectral_tilt": spectral_tilt}
 686
 687    except Exception as e:
 688        current_frame = inspect.currentframe()
 689        if current_frame is not None:
 690            current_function_name = current_frame.f_code.co_name
 691            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 692        return {"spectral_slope": np.nan, "spectral_tilt": np.nan}
 693
 694
 695def extract_cpp_descriptors(
 696    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, frame_shift: float
 697) -> Dict[str, float]:
 698    """Extract Cepstral Peak Prominence (CPP).
 699
 700    Function to calculate the Cepstral Peak Prominence (CPP) from a given sound object.
 701    This function is adapted from default Praat code to work with Parselmouth.
 702
 703    Args:
 704        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 705        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 706        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 707        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 708
 709    Returns:
 710        dict: A dictionary containing the following key:
 711
 712            - mean_cpp (float): Mean Cepstral Peak Prominence.
 713            - std_dev_cpp (float): Standard deviation in Cepstral Peak Prominence.
 714
 715    Examples:
 716        ```python
 717        >>> snd = parselmouth.Sound("path_to_audio.wav")
 718        >>> extract_CPP(snd, 75, 500, 0.01)
 719        {'mean_cpp': 20.3, 'std_dev_cpp': 0.5}
 720        ```
 721
 722    Notes:
 723        - Cepstral Peak Prominence: The height (i.e., “prominence”) of that peak relative to a regression line
 724        through the overall cepstrum.
 725        - Adapted from: https://osf.io/ctwgr and http://phonetics.linguistics.ucla.edu/facilities/acoustic/voiced_extract_auto.txt
 726    """
 727    if not PARSELMOUTH_AVAILABLE:
 728        raise ModuleNotFoundError(
 729            "`parselmouth` is not installed. "
 730            "Please install senselab audio dependencies using `pip install senselab`."
 731        )
 732
 733    try:
 734        if not isinstance(snd, parselmouth.Sound):
 735            snd = get_sound(snd)
 736
 737        # Extract pitch object for voiced checking
 738        pitch = snd.to_pitch_ac(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling, voicing_threshold=0.3)
 739
 740        pulses = parselmouth.praat.call([snd, pitch], "To PointProcess (cc)")
 741
 742        textgrid = parselmouth.praat.call(pulses, "To TextGrid (vuv)", 0.02, 0.1)
 743
 744        vuv_table = parselmouth.praat.call(textgrid, "Down to Table", "no", 6, "yes", "no")
 745        # Variables - include line number, Time decimals, include tier names, include empty intervals
 746
 747        cpp_list = []
 748
 749        n_intervals = parselmouth.praat.call(vuv_table, "Get number of rows")
 750        for i in range(n_intervals):
 751            label = parselmouth.praat.call(vuv_table, "Get value", i + 1, "text")
 752            if label == "V":
 753                tmin = parselmouth.praat.call(vuv_table, "Get value", i + 1, "tmin")
 754                tmax = parselmouth.praat.call(vuv_table, "Get value", i + 1, "tmax")
 755                snd_segment = snd.extract_part(float(tmin), float(tmax))
 756
 757                PowerCepstrogram = parselmouth.praat.call(snd_segment, "To PowerCepstrogram", 60, 0.002, 5000, 50)
 758                # PowerCepstrogram (60-Hz pitch floor, 2-ms time step, 5-kHz maximum frequency,
 759                # and pre-emphasis from 50 Hz)
 760
 761                try:
 762                    CPP_Value = parselmouth.praat.call(
 763                        PowerCepstrogram,
 764                        "Get CPPS...",
 765                        "no",
 766                        0.01,
 767                        0.001,
 768                        60,
 769                        330,
 770                        0.05,
 771                        "parabolic",
 772                        0.001,
 773                        0,
 774                        "Straight",
 775                        "Robust",
 776                    )
 777                    # Subtract tilt before smoothing = “no”; time averaging window = 0.01 s;
 778                    # quefrency averaging window = 0.001 s;
 779                    # Peak search pitch range = 60–330 Hz; tolerance = 0.05; interpolation = “Parabolic”;
 780                    # tilt line frequency range = 0.001–0 s (no upper bound);
 781                    # Line type = “Straight”; fit method = “Robust.”
 782                except Exception as e:
 783                    current_frame = inspect.currentframe()
 784                    if current_frame is not None:
 785                        current_function_name = current_frame.f_code.co_name
 786                        logger.error(f'Error in "{current_function_name}": \n' + str(e))
 787                    CPP_Value = np.nan
 788
 789                if not np.isnan(CPP_Value) and CPP_Value > 4:
 790                    cpp_list.append(CPP_Value)
 791
 792        # Calculate Final Features
 793        if cpp_list:
 794            CPP_array = np.array(cpp_list)
 795            CPP_mean = np.mean(CPP_array)
 796            CPP_std = np.std(CPP_array)
 797        else:
 798            CPP_mean = np.nan
 799            CPP_std = np.nan
 800
 801        # Return Result
 802        return {"mean_cpp": CPP_mean, "std_dev_cpp": CPP_std}
 803
 804    except Exception as e:
 805        current_frame = inspect.currentframe()
 806        if current_frame is not None:
 807            current_function_name = current_frame.f_code.co_name
 808            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 809        return {"mean_cpp": np.nan, "std_dev_cpp": np.nan}
 810
 811
 812def measure_f1f2_formants_bandwidths(
 813    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, frame_shift: float
 814) -> Dict[str, float]:
 815    """Extract Formant Frequency Features.
 816
 817    Function to extract formant frequency features from a given sound object. This function is adapted from default
 818    Praat code to work with Parselmouth.
 819
 820    Args:
 821        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 822        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 823        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 824        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 825
 826    Returns:
 827        dict: A dictionary containing the following keys:
 828
 829            - f1_mean (float): Mean F1 location.
 830            - f1_std (float): Standard deviation of F1 location.
 831            - b1_mean (float): Mean F1 bandwidth.
 832            - b1_std (float): Standard deviation of F1 bandwidth.
 833            - f2_mean (float): Mean F2 location.
 834            - f2_std (float): Standard deviation of F2 location.
 835            - b2_mean (float): Mean F2 bandwidth.
 836            - b2_std (float): Standard deviation of F2 bandwidth.
 837
 838    Examples:
 839        ```python
 840        >>> snd = parselmouth.Sound("path_to_audio.wav")
 841        >>> measureFormants(snd, 75, 500, 0.01)
 842        {'f1_mean': 500.0, 'f1_std': 50.0, 'b1_mean': 80.0, 'b1_std': 10.0, 'f2_mean': 1500.0,
 843        'f2_std': 100.0, 'b2_mean': 120.0, 'b2_std': 20.0}
 844        ```
 845
 846    Notes:
 847        - Formants are the resonances of the vocal tract, determined by tongue placement and vocal tract shape.
 848        - Mean F1 typically varies between 300 to 750 Hz, while mean F2 typically varies between 900 to 2300 Hz.
 849        - Formant bandwidth is measured by taking the width of the band forming 3 dB down from the formant peak.
 850        - Formant extraction occurs per pitch period (pulses), meaning that the analysis identifies the points in the
 851          sound where the vocal folds come together, helping to align the formant measurements precisely with the
 852          pitch periods.
 853        - Adapted from code at this [link](https://osf.io/6dwr3/).
 854    """
 855    if not PARSELMOUTH_AVAILABLE:
 856        raise ModuleNotFoundError(
 857            "`parselmouth` is not installed. "
 858            "Please install senselab audio dependencies using `pip install senselab`."
 859        )
 860
 861    try:
 862        if not isinstance(snd, parselmouth.Sound):
 863            snd = get_sound(snd)
 864
 865        # Extract formants
 866        formants = parselmouth.praat.call(snd, "To Formant (burg)", frame_shift, 5, 5000, 0.025, 50)
 867        # Key Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Formant__burg____.html
 868
 869        # Extract pitch using CC method
 870        pitch = snd.to_pitch_cc(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)
 871        pulses = parselmouth.praat.call([snd, pitch], "To PointProcess (cc)")
 872
 873        F1_list, F2_list, B1_list, B2_list = [], [], [], []
 874        numPoints = parselmouth.praat.call(pulses, "Get number of points")
 875
 876        for point in range(1, numPoints + 1):
 877            t = parselmouth.praat.call(pulses, "Get time from index", point)
 878
 879            F1_value = parselmouth.praat.call(formants, "Get value at time", 1, t, "Hertz", "Linear")
 880            if not np.isnan(F1_value):
 881                F1_list.append(F1_value)
 882
 883            B1_value = parselmouth.praat.call(formants, "Get bandwidth at time", 1, t, "Hertz", "Linear")
 884            if not np.isnan(B1_value):
 885                B1_list.append(B1_value)
 886
 887            F2_value = parselmouth.praat.call(formants, "Get value at time", 2, t, "Hertz", "Linear")
 888            if not np.isnan(F2_value):
 889                F2_list.append(F2_value)
 890
 891            B2_value = parselmouth.praat.call(formants, "Get bandwidth at time", 2, t, "Hertz", "Linear")
 892            if not np.isnan(B2_value):
 893                B2_list.append(B2_value)
 894
 895        f1_mean, f1_std = (np.mean(F1_list), np.std(F1_list)) if F1_list else (np.nan, np.nan)
 896        b1_mean, b1_std = (np.mean(B1_list), np.std(B1_list)) if B1_list else (np.nan, np.nan)
 897        f2_mean, f2_std = (np.mean(F2_list), np.std(F2_list)) if F2_list else (np.nan, np.nan)
 898        b2_mean, b2_std = (np.mean(B2_list), np.std(B2_list)) if B2_list else (np.nan, np.nan)
 899
 900        return {
 901            "f1_mean": f1_mean,
 902            "f1_std": f1_std,
 903            "b1_mean": b1_mean,
 904            "b1_std": b1_std,
 905            "f2_mean": f2_mean,
 906            "f2_std": f2_std,
 907            "b2_mean": b2_mean,
 908            "b2_std": b2_std,
 909        }
 910
 911    except Exception as e:
 912        current_frame = inspect.currentframe()
 913        if current_frame is not None:
 914            current_function_name = current_frame.f_code.co_name
 915            logger.error(f'Error in "{current_function_name}": \n' + str(e))
 916        return {
 917            "f1_mean": np.nan,
 918            "f1_std": np.nan,
 919            "b1_mean": np.nan,
 920            "b1_std": np.nan,
 921            "f2_mean": np.nan,
 922            "f2_std": np.nan,
 923            "b2_mean": np.nan,
 924            "b2_std": np.nan,
 925        }
 926
 927
 928def extract_spectral_moments(
 929    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, window_size: float, frame_shift: float
 930) -> Dict[str, float]:
 931    """Extract Spectral Moments.
 932
 933    Function to extract spectral moments from a given sound object. This function is adapted from default
 934    Praat code to work with Parselmouth.
 935
 936    Args:
 937        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 938        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 939        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 940        window_size (float): Time frame over which the spectra is calculated, typically set to 25 ms.
 941        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 942
 943    Returns:
 944        dict: A dictionary containing the following keys:
 945
 946            - spectral_gravity (float): Mean spectral gravity.
 947            - spectral_std_dev (float): Mean spectral standard deviation.
 948            - spectral_skewness (float): Mean spectral skewness.
 949            - spectral_kurtosis (float): Mean spectral kurtosis.
 950
 951    Examples:
 952        ```python
 953        >>> snd = parselmouth.Sound("path_to_audio.wav")
 954        >>> extract_spectral_moments(snd, 75, 500, 0.025, 0.01)
 955        {'spectral_gravity': 5000.0, 'spectral_std_dev': 150.0, 'spectral_skewness': -0.5, 'spectral_kurtosis': 3.0}
 956        ```
 957
 958    Notes:
 959        - Spectral Gravity: Measure for how high the frequencies in a spectrum are on average over the entire frequency
 960        domain weighted by the power spectrum.
 961        - Spectral Standard Deviation: Measure for how much the frequencies in a spectrum can deviate from the centre
 962        of gravity.
 963        - Spectral Skewness: Measure for how much the shape of the spectrum below the centre of gravity is different
 964        from the shape above the mean frequency.
 965        - Spectral Kurtosis: Measure for how much the shape of the spectrum around the centre of gravity is different
 966          from a Gaussian shape.
 967        - Details: https://www.fon.hum.uva.nl/praat/manual/Spectrum__Get_central_moment___.html
 968    """
 969    if not PARSELMOUTH_AVAILABLE:
 970        raise ModuleNotFoundError(
 971            "`parselmouth` is not installed. "
 972            "Please install senselab audio dependencies using `pip install senselab`."
 973        )
 974
 975    try:
 976        if not isinstance(snd, parselmouth.Sound):
 977            snd = get_sound(snd)
 978
 979        # Extract pitch object for voiced checking
 980        pitch = snd.to_pitch_ac(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)
 981
 982        # Calculate Spectrogram
 983        spectrogram = snd.to_spectrogram(window_length=window_size, time_step=frame_shift)
 984        # Using default settings other than window length and frame shift
 985        # Details: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Spectrogram___.html
 986
 987        Gravity_list, STD_list, Skew_list, Kurt_list = [], [], [], []
 988
 989        num_steps = parselmouth.praat.call(spectrogram, "Get number of frames")
 990        for i in range(1, num_steps + 1):
 991            t = parselmouth.praat.call(spectrogram, "Get time from frame number", i)
 992            pitch_value = pitch.get_value_at_time(t)
 993
 994            if not np.isnan(pitch_value):
 995                voiced_spectrum = spectrogram.to_spectrum_slice(t)
 996                # Details: https://www.fon.hum.uva.nl/praat/manual/Spectrogram__To_Spectrum__slice____.html
 997
 998                Gravity_LLD = voiced_spectrum.get_centre_of_gravity(power=2)
 999                if not np.isnan(Gravity_LLD):
1000                    Gravity_list.append(Gravity_LLD)
1001
1002                STD_LLD = voiced_spectrum.get_standard_deviation(power=2)
1003                if not np.isnan(STD_LLD):
1004                    STD_list.append(STD_LLD)
1005
1006                Skew_LLD = voiced_spectrum.get_skewness(power=2)
1007                if not np.isnan(Skew_LLD):
1008                    Skew_list.append(Skew_LLD)
1009
1010                Kurt_LLD = voiced_spectrum.get_kurtosis(power=2)
1011                if not np.isnan(Kurt_LLD):
1012                    Kurt_list.append(Kurt_LLD)
1013
1014        gravity_mean = np.mean(Gravity_list) if Gravity_list else np.nan
1015        std_mean = np.mean(STD_list) if STD_list else np.nan
1016        skew_mean = np.mean(Skew_list) if Skew_list else np.nan
1017        kurt_mean = np.mean(Kurt_list) if Kurt_list else np.nan
1018
1019        return {
1020            "spectral_gravity": gravity_mean,
1021            "spectral_std_dev": std_mean,
1022            "spectral_skewness": skew_mean,
1023            "spectral_kurtosis": kurt_mean,
1024        }
1025
1026    except Exception as e:
1027        current_frame = inspect.currentframe()
1028        if current_frame is not None:
1029            current_function_name = current_frame.f_code.co_name
1030            logger.error(f'Error in "{current_function_name}": \n' + str(e))
1031        return {
1032            "spectral_gravity": np.nan,
1033            "spectral_std_dev": np.nan,
1034            "spectral_skewness": np.nan,
1035            "spectral_kurtosis": np.nan,
1036        }
1037
1038
1039### More functions ###
1040
1041
def extract_audio_duration(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
    """Get the duration of a given audio file or Audio object.

    The input is turned into a Parselmouth `Sound` when it is not one already, and Praat's
    "Get total duration" query is then run on it.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object,
            a file path (Path), or an `Audio` object containing the audio waveform and
            its corresponding sampling rate.

    Returns:
        Dict[str, float]: A dictionary containing:
            - "duration" (float): The total duration of the audio in seconds.
              NaN when the Praat query fails; the error is logged, not raised.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.
        FileNotFoundError: If a provided file path does not exist (loading via `get_sound`
            happens outside the error handler, so its exceptions reach the caller).

    Example:
        ```python
        >>> snd = Audio(waveform=[...], sampling_rate=16000)
        >>> extract_audio_duration(snd)
        {'duration': 5.23}
        ```
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    # Paths and Audio objects are converted first; this step is deliberately not guarded.
    sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

    try:
        return {"duration": parselmouth.praat.call(sound, "Get total duration")}
    except Exception as err:
        frame = inspect.currentframe()
        if frame is not None:
            logger.error(f'Error in "{frame.f_code.co_name}": \n' + str(err))

    # Only reached when the query above raised.
    return {"duration": np.nan}
1090
1091
def extract_jitter(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
    """Returns the jitter descriptors for the given sound or audio file.

    The sound is converted to a periodic point process (cross-correlation) and the Praat
    "Get jitter (...)" queries are run on it over the whole time range.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path),
            or an `Audio` object containing the audio waveform and its corresponding sampling rate.
        floor (float): Minimum fundamental frequency (F0) in Hz.
        ceiling (float): Maximum fundamental frequency (F0) in Hz.

    Returns:
        Dict[str, float]: A dictionary with the keys `local_jitter`, `localabsolute_jitter`,
        `rap_jitter`, `ppq5_jitter` and `ddp_jitter`. Every value is NaN when the measurement
        fails; the error is logged, not raised.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.

    Notes:
        - Loading a non-Sound input via `get_sound` happens outside the error handler, so exceptions
          raised while loading reach the caller.
    """
    # The availability check must run before the helpers are defined: their annotations use
    # `parselmouth.Data`, which are evaluated when the `def` executes and which the placeholder
    # object standing in for a missing parselmouth does not provide. Checking afterwards would
    # surface an AttributeError instead of the intended ModuleNotFoundError.
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    def _to_point_process(sound: parselmouth.Sound, f0min: float, f0max: float) -> parselmouth.Data:
        """Build the periodic (cc) point process the jitter queries operate on."""
        return parselmouth.praat.call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

    def _extract_jitter(measure: str, point_process: parselmouth.Data) -> float:
        """Run one "Get jitter (<measure>)" query."""
        # Arguments: time range start/end (0, 0 = whole signal), shortest period (s),
        # longest period (s), maximum period factor.
        return parselmouth.praat.call(point_process, f"Get jitter ({measure})", 0, 0, 0.0001, 0.02, 1.3)

    # Check if the input is a Path or Audio, and convert to Parselmouth Sound if necessary
    if not isinstance(snd, parselmouth.Sound):
        snd = get_sound(snd)

    try:
        # Convert the sound to a point process for jitter measurement
        point_process = _to_point_process(snd, floor, ceiling)

        # Extract jitter measures from the point process
        return {
            "local_jitter": _extract_jitter("local", point_process),
            "localabsolute_jitter": _extract_jitter("local, absolute", point_process),
            "rap_jitter": _extract_jitter("rap", point_process),
            "ppq5_jitter": _extract_jitter("ppq5", point_process),
            "ddp_jitter": _extract_jitter("ddp", point_process),
        }

    except Exception as e:
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {
            "local_jitter": np.nan,
            "localabsolute_jitter": np.nan,
            "rap_jitter": np.nan,
            "ppq5_jitter": np.nan,
            "ddp_jitter": np.nan,
        }
1146
1147
def extract_shimmer(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
    """Returns the shimmer descriptors for the given sound or audio file.

    The sound is converted to a periodic point process (cross-correlation) and the Praat
    "Get shimmer (...)" queries are run on the sound together with that point process, over the
    whole time range.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path),
            or an `Audio` object containing the audio waveform and its corresponding sampling rate.
        floor (float): Minimum fundamental frequency (F0) in Hz.
        ceiling (float): Maximum fundamental frequency (F0) in Hz.

    Returns:
        Dict[str, float]: A dictionary with the keys `local_shimmer`, `localDB_shimmer`,
        `apq3_shimmer`, `apq5_shimmer`, `apq11_shimmer` and `dda_shimmer`. Every value is NaN when
        the measurement fails; the error is logged, not raised.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.

    Notes:
        - Loading a non-Sound input via `get_sound` happens outside the error handler, so exceptions
          raised while loading reach the caller.
    """
    # The availability check must run before the helpers are defined: their annotations use
    # `parselmouth.Data`, which are evaluated when the `def` executes and which the placeholder
    # object standing in for a missing parselmouth does not provide. Checking afterwards would
    # surface an AttributeError instead of the intended ModuleNotFoundError.
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    def _to_point_process(sound: parselmouth.Sound, f0min: float, f0max: float) -> parselmouth.Data:
        """Build the periodic (cc) point process the shimmer queries operate on."""
        return parselmouth.praat.call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

    def _extract_shimmer(measure: str, sound: parselmouth.Sound, point_process: parselmouth.Data) -> float:
        """Run one "Get shimmer (<measure>)" query."""
        # Arguments: time range start/end (0, 0 = whole signal), shortest period (s),
        # longest period (s), maximum period factor, maximum amplitude factor.
        return parselmouth.praat.call(
            [sound, point_process], f"Get shimmer ({measure})", 0, 0, 0.0001, 0.02, 1.3, 1.6
        )

    # Check if the input is a Path or Audio, and convert to Parselmouth Sound if necessary
    if not isinstance(snd, parselmouth.Sound):
        snd = get_sound(snd)

    try:
        # Convert the sound to a point process for shimmer measurement
        point_process = _to_point_process(snd, floor, ceiling)

        # Extract shimmer measures from the sound and point process
        return {
            "local_shimmer": _extract_shimmer("local", snd, point_process),
            "localDB_shimmer": _extract_shimmer("local_dB", snd, point_process),
            "apq3_shimmer": _extract_shimmer("apq3", snd, point_process),
            "apq5_shimmer": _extract_shimmer("apq5", snd, point_process),
            "apq11_shimmer": _extract_shimmer("apq11", snd, point_process),
            "dda_shimmer": _extract_shimmer("dda", snd, point_process),
        }

    except Exception as e:
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {
            "local_shimmer": np.nan,
            "localDB_shimmer": np.nan,
            "apq3_shimmer": np.nan,
            "apq5_shimmer": np.nan,
            "apq11_shimmer": np.nan,
            "dda_shimmer": np.nan,
        }
1204
1205
1206# Wrapper
1207def extract_praat_parselmouth_features_from_audios(
1208    audios: List[Audio],
1209    time_step: float = 0.005,
1210    window_length: float = 0.025,
1211    pitch_unit: str = "Hertz",
1212    speech_rate: bool = True,
1213    intensity_descriptors: bool = True,
1214    harmonicity_descriptors: bool = True,
1215    formants: bool = True,
1216    spectral_moments: bool = True,
1217    pitch: bool = True,
1218    slope_tilt: bool = True,
1219    cpp_descriptors: bool = True,
1220    duration: bool = True,
1221    jitter: bool = True,
1222    shimmer: bool = True,
1223) -> List[Dict[str, Any]]:
1224    """Extract features from a list of Audio objects and return a JSON-like dictionary.
1225
1226    Args:
1227        audios (list): List of Audio objects to extract features from.
1228        pitch_unit (str): Unit for pitch measurements. Defaults to "Hertz".
1229        time_step (float): Time rate at which to extract features. Defaults to 0.005.
1230        window_length (float): Window length in seconds for spectral features. Defaults to 0.025.
1231        speech_rate (bool): Whether to extract speech rate. Defaults to True.
1232        intensity_descriptors (bool): Whether to extract intensity descriptors. Defaults to True.
1233        harmonicity_descriptors (bool): Whether to extract harmonic descriptors. Defaults to True.
1234        formants (bool): Whether to extract formants. Defaults to True.
1235        spectral_moments (bool): Whether to extract spectral moments. Defaults to True.
1236        pitch (bool): Whether to extract pitch. Defaults to True.
1237        slope_tilt (bool): Whether to extract slope and tilt. Defaults to True.
1238        cpp_descriptors (bool): Whether to extract CPP descriptors. Defaults to True.
1239        duration (bool): Whether to extract duration. Defaults to True.
1240        jitter (bool): Whether to extract jitter. Defaults to True.
1241        shimmer (bool): Whether to extract shimmer. Defaults to True.
1242
1243    Returns:
1244        dict: A JSON-like dictionary with extracted features structured under "praat_parselmouth".
1245    """
1246    extracted_data: List[Dict[str, Any]] = []
1247
1248    for snd in audios:
1249        # --- shared precomputations ---
1250        pitch_values_out = extract_pitch_values(snd=snd)
1251        pitch_floor = pitch_values_out["pitch_floor"]
1252        pitch_ceiling = pitch_values_out["pitch_ceiling"]
1253
1254        # Precompute blocks conditionally
1255        speech_rate_out = extract_speech_rate(snd=snd) if speech_rate else None
1256        pitch_out = (
1257            extract_pitch_descriptors(
1258                snd=snd,
1259                floor=pitch_floor,
1260                ceiling=pitch_ceiling,
1261                frame_shift=time_step,
1262                unit=pitch_unit,
1263            )
1264            if pitch
1265            else None
1266        )
1267        intensity_out = (
1268            extract_intensity_descriptors(
1269                snd=snd,
1270                floor=pitch_floor,
1271                frame_shift=time_step,
1272            )
1273            if intensity_descriptors
1274            else None
1275        )
1276        harmonicity_out = (
1277            extract_harmonicity_descriptors(
1278                snd=snd,
1279                floor=pitch_floor,
1280                frame_shift=time_step,
1281            )
1282            if harmonicity_descriptors
1283            else None
1284        )
1285        formants_out = (
1286            measure_f1f2_formants_bandwidths(
1287                snd=snd,
1288                floor=pitch_floor,
1289                ceiling=pitch_ceiling,
1290                frame_shift=time_step,
1291            )
1292            if formants
1293            else None
1294        )
1295        spectral_moments_out = (
1296            extract_spectral_moments(
1297                snd=snd,
1298                floor=pitch_floor,
1299                ceiling=pitch_ceiling,
1300                window_size=window_length,
1301                frame_shift=time_step,
1302            )
1303            if spectral_moments
1304            else None
1305        )
1306        slope_tilt_out = (
1307            extract_slope_tilt(
1308                snd=snd,
1309                floor=pitch_floor,
1310                ceiling=pitch_ceiling,
1311            )
1312            if slope_tilt
1313            else None
1314        )
1315        cpp_out = (
1316            extract_cpp_descriptors(
1317                snd=snd,
1318                floor=pitch_floor,
1319                ceiling=pitch_ceiling,
1320                frame_shift=time_step,
1321            )
1322            if cpp_descriptors
1323            else None
1324        )
1325        audio_duration_out = extract_audio_duration(snd=snd) if duration else None
1326        jitter_out = (
1327            extract_jitter(
1328                snd=snd,
1329                floor=pitch_floor,
1330                ceiling=pitch_ceiling,
1331            )
1332            if jitter
1333            else None
1334        )
1335        shimmer_out = (
1336            extract_shimmer(
1337                snd=snd,
1338                floor=pitch_floor,
1339                ceiling=pitch_ceiling,
1340            )
1341            if shimmer
1342            else None
1343        )
1344
1345        # --- collect outputs ---
1346        feature_data: Dict[str, Any] = {}
1347
1348        if duration and audio_duration_out is not None:
1349            feature_data["duration"] = audio_duration_out["duration"]
1350
1351        if speech_rate and speech_rate_out is not None:
1352            feature_data["speaking_rate"] = speech_rate_out["speaking_rate"]
1353            feature_data["articulation_rate"] = speech_rate_out["articulation_rate"]
1354            feature_data["phonation_ratio"] = speech_rate_out["phonation_ratio"]
1355            feature_data["pause_rate"] = speech_rate_out["pause_rate"]
1356            feature_data["mean_pause_duration"] = speech_rate_out["mean_pause_dur"]
1357
1358        if pitch and pitch_out is not None:
1359            unit_l = pitch_unit.lower()
1360            feature_data[f"mean_f0_{unit_l}"] = pitch_out[f"mean_f0_{unit_l}"]
1361            feature_data[f"std_f0_{unit_l}"] = pitch_out[f"stdev_f0_{unit_l}"]
1362
1363        if intensity_descriptors and intensity_out is not None:
1364            feature_data["mean_intensity_db"] = intensity_out["mean_db"]
1365            feature_data["std_intensity_db"] = intensity_out["std_db"]
1366            feature_data["range_ratio_intensity_db"] = intensity_out["range_db_ratio"]
1367
1368        if harmonicity_descriptors and harmonicity_out is not None:
1369            feature_data["mean_hnr_db"] = harmonicity_out["hnr_db_mean"]
1370            feature_data["std_hnr_db"] = harmonicity_out["hnr_db_std_dev"]
1371
1372        if slope_tilt and slope_tilt_out is not None:
1373            feature_data["spectral_slope"] = slope_tilt_out["spectral_slope"]
1374            feature_data["spectral_tilt"] = slope_tilt_out["spectral_tilt"]
1375
1376        if cpp_descriptors and cpp_out is not None:
1377            feature_data["cepstral_peak_prominence_mean"] = cpp_out["mean_cpp"]
1378            feature_data["cepstral_peak_prominence_std"] = cpp_out["std_dev_cpp"]
1379
1380        if formants and formants_out is not None:
1381            feature_data["mean_f1_loc"] = formants_out["f1_mean"]
1382            feature_data["std_f1_loc"] = formants_out["f1_std"]
1383            feature_data["mean_b1_loc"] = formants_out["b1_mean"]
1384            feature_data["std_b1_loc"] = formants_out["b1_std"]
1385            feature_data["mean_f2_loc"] = formants_out["f2_mean"]
1386            feature_data["std_f2_loc"] = formants_out["f2_std"]
1387            feature_data["mean_b2_loc"] = formants_out["b2_mean"]
1388            feature_data["std_b2_loc"] = formants_out["b2_std"]
1389
1390        if spectral_moments and spectral_moments_out is not None:
1391            feature_data["spectral_gravity"] = spectral_moments_out["spectral_gravity"]
1392            feature_data["spectral_std_dev"] = spectral_moments_out["spectral_std_dev"]
1393            feature_data["spectral_skewness"] = spectral_moments_out["spectral_skewness"]
1394            feature_data["spectral_kurtosis"] = spectral_moments_out["spectral_kurtosis"]
1395
1396        if jitter and jitter_out is not None:
1397            feature_data["local_jitter"] = jitter_out["local_jitter"]
1398            feature_data["localabsolute_jitter"] = jitter_out["localabsolute_jitter"]
1399            feature_data["rap_jitter"] = jitter_out["rap_jitter"]
1400            feature_data["ppq5_jitter"] = jitter_out["ppq5_jitter"]
1401            feature_data["ddp_jitter"] = jitter_out["ddp_jitter"]
1402
1403        if shimmer and shimmer_out is not None:
1404            feature_data["local_shimmer"] = shimmer_out["local_shimmer"]
1405            feature_data["localDB_shimmer"] = shimmer_out["localDB_shimmer"]
1406            feature_data["apq3_shimmer"] = shimmer_out["apq3_shimmer"]
1407            feature_data["apq5_shimmer"] = shimmer_out["apq5_shimmer"]
1408            feature_data["apq11_shimmer"] = shimmer_out["apq11_shimmer"]
1409            feature_data["dda_shimmer"] = shimmer_out["dda_shimmer"]
1410
1411        extracted_data.append(feature_data)
1412
1413    return extracted_data
def get_sound( audio: Union[pathlib.Path, senselab.audio.data_structures.audio.Audio], sampling_rate: int = 16000) -> parselmouth.Sound:
def get_sound(audio: Union[Path, Audio], sampling_rate: int = 16000) -> parselmouth.Sound:
    """Get a mono Parselmouth sound object from an audio file or Audio object.

    The sound is mixed down to a single channel when it has more than one, and
    resampled when its sampling frequency differs from ``sampling_rate``.

    Args:
        audio (Union[Path, Audio]): A path to an audio file or an Audio object.
        sampling_rate (int, optional): The sampling rate of the returned sound. Defaults to 16000.

    Returns:
        parselmouth.Sound: A Parselmouth Sound object.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.
        RuntimeError: If the sound cannot be loaded or preprocessed. Every failure inside the
            loading step is reported this way, including a missing file (the original
            FileNotFoundError is available as ``__cause__``) and an ``audio`` argument that is
            neither a Path nor an Audio object.
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    try:
        # Loading the sound
        if isinstance(audio, Path):
            audio = audio.resolve()
            if not audio.exists():
                logger.error(f"File does not exist: {audio}")
                raise FileNotFoundError(f"File does not exist: {audio}")
            snd_full = parselmouth.Sound(str(audio))
        elif isinstance(audio, Audio):
            snd_full = parselmouth.Sound(audio.waveform, audio.sampling_rate)
        else:
            # Fail with a clear message instead of an unbound-variable error further down.
            raise TypeError(f"Unsupported audio input type: {type(audio).__name__}")

        # Preprocessing
        if parselmouth.praat.call(snd_full, "Get number of channels") > 1:
            snd_full = snd_full.convert_to_mono()
        if parselmouth.praat.call(snd_full, "Get sampling frequency") != sampling_rate:
            snd_full = parselmouth.praat.call(snd_full, "Resample", sampling_rate, 50)
            # Details of query: https://www.fon.hum.uva.nl/praat/manual/Get_sampling_frequency.html
            # Details of conversion: https://www.fon.hum.uva.nl/praat/manual/Sound__Resample___.html
    except Exception as e:
        # Keep the historical RuntimeError contract, but chain the cause so the
        # original exception type and traceback stay inspectable.
        raise RuntimeError(f"Error loading sound: {e}") from e
    return snd_full

Get a sound object from a given audio file or Audio object.

Arguments:
  • audio (Union[Path, Audio]): A path to an audio file or an Audio object.
  • sampling_rate (int, optional): The sampling rate of the audio. Defaults to 16000.
Returns:

parselmouth.Sound: A Parselmouth Sound object.

Raises:
  • FileNotFoundError: If the file is not found at the given path.
def extract_speech_rate( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio]) -> Dict[str, float]:
 91def extract_speech_rate(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
 92    """Extract speech timing and pausing features from a given sound object.
 93
 94    Args:
 95        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 96
 97    Returns:
 98        Dict[str, float]: A dictionary containing the following features:
 99
100            - speaking_rate (float): Number of syllables divided by duration.
101            - articulation_rate (float): Number of syllables divided by phonation time.
102            - phonation_ratio (float): Phonation time divided by duration.
103            - pause_rate (float): Number of pauses divided by duration.
104            - mean_pause_dur (float): Total time pausing divided by the number of identified pauses.
105
106    Examples:
107        ```python
108        >>> snd = parselmouth.Sound("path_to_audio.wav")
109        >>> extract_speech_rate(snd)
110        {
111            'speaking_rate': 5.3,
112            'articulation_rate': 4.7,
113            'phonation_ratio': 0.9,
114            'pause_rate': 2.1,
115            'mean_pause_dur': 0.5
116        }
117        ```
118
119    Useful sources for this code:
120
121        - https://sites.google.com/view/uhm-o-meter/scripts/syllablenuclei_v3?pli=1
122        - https://drive.google.com/file/d/1o3mNdN5FKTiYQC9GHB1XoZ8JJIGZk_AK/view
123        - (2009 paper) https://doi.org/10.3758/BRM.41.2.385
124        - (2021 paper) https://doi.org/10.1080/0969594X.2021.1951162
125    """
126    if not PARSELMOUTH_AVAILABLE:
127        raise ModuleNotFoundError(
128            "`parselmouth` is not installed. "
129            "Please install senselab audio dependencies using `pip install senselab`."
130        )
131
132    try:
133        # _____________________________________________________________________________________________________________
134        # Load the sound object into parselmouth if it is an Audio object
135        if not isinstance(snd, parselmouth.Sound):
136            snd = get_sound(snd)
137
138        # _____________________________________________________________________________________________________________
139        # Key pause detection hyperparameters
140
141        # Silence Threshold (dB) - standard setting to detect silence in the "To TextGrid (silences)" function.
142        # The higher this number, the lower the chances of finding silent pauses
143        silence_db = -25
144
145        # Minimum_dip_between_peaks_(dB) - if there are decreases in intensity
146        # of at least this value surrounding the peak, the peak is labelled to be a syllable nucleus
147        # I.e. the size of the dip between two possible peakes
148        # The higher this number, the less syllables will be found
149        # For clean and filtered signal use 4, if not use 2 (recommend thresholds)
150        min_dip = 4
151        # Code for determining if the signal not clean/filtered
152        hnr = parselmouth.praat.call(
153            snd.to_harmonicity_cc(), "Get mean", 0, 0
154        )  # Note: (0,0) is the time range for extraction, setting both two zero tells praat to use the full file
155        if hnr < 60:
156            min_dip = 2
157
158        # Minimum pause duration (s): How long should a pause be to be counted as a silent pause?
159        # The higher this number, the fewer pauses will be found
160        min_pause = 0.3  # the default for this is 0.1 in Praat, the de Jong's script has this set at 0.3
161        # Based on values in: Toward an understanding of fluency:
162        # A microanalysis of nonnative speaker conversations (Riggenbach)
163        # – Micropause (silence of .2s or less)
164        # – Hesitation (silence of .3 to .4s)
165        # – Unfilled pause (silence of .5s or more)
166
167        # ______________________________________________________________________________________________________________
168        # Intensity information
169
170        intensity = snd.to_intensity(minimum_pitch=50, time_step=0.016, subtract_mean=True)
171        # These are the setting recommended by de jong - minimum pitch” set to 50 Hz,.
172        # With this parameter setting, we extract intensity smoothed over a time window of (3.2/minimum_pitch)=64 msec,
173        #  with 16-msec time steps explanation on these calculations are found at:
174        # https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html
175
176        min_intensity = parselmouth.praat.call(intensity, "Get minimum", 0, 0, "Parabolic")  # time range, Interpolation
177        max_intensity = parselmouth.praat.call(intensity, "Get maximum", 0, 0, "Parabolic")  # time range, Interpolation
178
179        # Silince is detected by measuring whether the intensity is 25 dB below the 99% highest peak
180        # 99% is chosen to eliminate short loud bursts in intensity that may not have been speech
181
182        # get .99 quantile to get maximum (without influence of non-speech sound bursts)
183        max_99_intensity = parselmouth.praat.call(intensity, "Get quantile", 0, 0, 0.99)
184
185        # estimate Intensity threshold
186        silence_db_1 = max_99_intensity + silence_db
187        db_adjustment = max_intensity - max_99_intensity
188        silence_db_2 = silence_db - db_adjustment
189        if silence_db_1 < min_intensity:
190            silence_db_1 = min_intensity
191
192        # ______________________________________________________________________________________________________________
193        # Create a TextGrid in which the silent and sounding intervals, store these intervals
194
195        textgrid = parselmouth.praat.call(
196            intensity, "To TextGrid (silences)", silence_db_2, min_pause, 0.1, "silent", "sounding"
197        )
198        # Hyperparameters:
199        # Silence threshold (dB),
200        # Minimum silent interval (s) - minimum duration for an interval to be considered as silent
201        # Minimum sounding interval (s) - minimum duration for an interval to be not considered as silent
202        # Silent interval label
203        # Sounding interval label
204
205        # Loop through intervals and extract times of identified silent and sounding sections
206        silencetier = parselmouth.praat.call(textgrid, "Extract tier", 1)
207        silencetable = parselmouth.praat.call(silencetier, "Down to TableOfReal", "sounding")
208        npauses = parselmouth.praat.call(silencetable, "Get number of rows")
209
210        phonation_time = 0
211        for ipause in range(npauses):
212            pause = ipause + 1
213            beginsound = parselmouth.praat.call(silencetable, "Get value", pause, 1)
214            endsound = parselmouth.praat.call(silencetable, "Get value", pause, 2)
215            speakingdur = endsound - beginsound
216
217            phonation_time += speakingdur
218
219            # This is to remove the first (before first word) and last (after last word) silence from consideration
220            if pause == 1:
221                begin_speak = beginsound
222            if pause == (npauses):
223                end_speak = endsound
224
225        # ______________________________________________________________________________________________________________
226        # Next block of code finds all possible peaks
227
228        # Convert intensity countor into sound representation
229        intensity_matrix = parselmouth.praat.call(intensity, "Down to Matrix")  # convert intensity to 2d representation
230
231        # Convert intensity countor into sound representation
232        sound_from_intensity_matrix = parselmouth.praat.call(intensity_matrix, "To Sound (slice)", 1)
233
234        # find positive extrema, maxima in sound_from_intensity_matrix, which correspond to steepest rises in Intensity;
235        point_process = parselmouth.praat.call(
236            sound_from_intensity_matrix,
237            "To PointProcess (extrema)",
238            "Left",
239            "yes",
240            "no",
241            "Sinc70",
242        )
243
244        # estimate peak positions (all peaks)
245        t = []
246        numpeaks = parselmouth.praat.call(point_process, "Get number of points")
247        for i in range(numpeaks):
248            t.append(parselmouth.praat.call(point_process, "Get time from index", i + 1))
249
250        # ______________________________________________________________________________________________________________
251        # Find the time and values of all peaks
252
253        # fill array with intensity values
254        timepeaks = []
255        peakcount = 0
256        intensities = []
257        for i in range(numpeaks):
258            value = parselmouth.praat.call(sound_from_intensity_matrix, "Get value at time", t[i], "Cubic")
259            if value > silence_db_1:
260                peakcount += 1
261                intensities.append(value)
262                timepeaks.append(t[i])
263
264        # ______________________________________________________________________________________________________________
265        # Now find all valid peaks
266
267        # fill array with valid peaks: only intensity values if preceding
268        # dip in intensity is greater than min_dip
269        validpeakcount = 0
270        currenttime = timepeaks[0]
271        currentint = intensities[0]
272        validtime = []
273
274        for p in range(peakcount - 1):
275            following = p + 1
276            followingtime = timepeaks[following]
277            dip = parselmouth.praat.call(
278                intensity, "Get minimum", currenttime, followingtime, "None"
279            )  # Gets minimiun value between two time points, doesn't intepolote/filter
280            diffint = abs(currentint - dip)
281            if diffint > min_dip:
282                validpeakcount += 1
283                validtime.append(timepeaks[p])
284            # Update current time and intensity values for next loop
285            currenttime = timepeaks[following]
286            currentint = parselmouth.praat.call(intensity, "Get value at time", timepeaks[following], "Cubic")
287
288        # ______________________________________________________________________________________________________________
289        # Extract voicing information
290
291        pitch = snd.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
292        # Praat page for hyperparamters https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html
293        # From de Jong's 2009 paper - We extract the pitch contour, this time using a window size of 100 msec
294        # and 20-msec time steps, and exclude all peaks that are unvoiced
295        # Key Hyperparamter are different to praat recommended - can't find a reason for this
296        # time_step: Optional[Positive[float]] = None,  - set per De jong's recommendation
297        # pitch_floor: Positive[float] = 75.0 set per dejong recommendation - 3/30 gives 100ms
298        # max_number_of_candidates: Positive[int] = 15 (can't find a reason for this value being lower)
299        # very_accurate: bool = False,
300        # silence_threshold: float = 0.03,
301        # voicing_threshold: float = 0.45, (can't find a reason for this value being different)
302        # octave_cost: float = 0.01,
303        # octave_jump_cost: float = 0.35,
304        # voiced_unvoiced_cost: float = 0.14, (can't find a reason for this value being different)
305        # pitch_ceiling: Positive[float] = 600.0 (can't find a reason for this value being lower, might change to value
306        # from pitch_value function)
307
308        # ______________________________________________________________________________________________________________
309        # Loop through valid peaks, count ones that are voiced (i.e., have valid pitch value at the same time)
310
311        number_syllables = int(0)
312        for time in range(validpeakcount):
313            querytime = validtime[time]
314            whichinterval = parselmouth.praat.call(textgrid, "Get interval at time", 1, querytime)
315            whichlabel = parselmouth.praat.call(textgrid, "Get label of interval", 1, whichinterval)
316            value = pitch.get_value_at_time(querytime)
317            if not np.isnan(value):
318                if whichlabel == "sounding":
319                    number_syllables += 1
320
321        # ______________________________________________________________________________________________________________
322        # return results
323
324        original_dur = end_speak - begin_speak
325
326        speaking_rate = number_syllables / original_dur
327        articulation_rate = number_syllables / phonation_time
328        phonation_ratio = phonation_time / original_dur
329
330        number_pauses = npauses - 1
331        pause_time = original_dur - phonation_time
332
333        pause_rate = number_pauses / original_dur
334        mean_pause_dur = pause_time / number_pauses if number_pauses > 0 else 0.0
335
336        return {
337            "speaking_rate": speaking_rate,
338            "articulation_rate": articulation_rate,
339            "phonation_ratio": phonation_ratio,
340            "pause_rate": pause_rate,
341            "mean_pause_dur": mean_pause_dur,
342        }
343
344    except Exception as e:
345        current_frame = inspect.currentframe()
346        if current_frame is not None:
347            current_function_name = current_frame.f_code.co_name
348            logger.error(f'Error in "{current_function_name}": \n' + str(e))
349        return {
350            "speaking_rate": np.nan,
351            "articulation_rate": np.nan,
352            "phonation_ratio": np.nan,
353            "pause_rate": np.nan,
354            "mean_pause_dur": np.nan,
355        }

Extract speech timing and pausing features from a given sound object.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
Returns:

Dict[str, float]: A dictionary containing the following features:

- speaking_rate (float): Number of syllables divided by duration.
- articulation_rate (float): Number of syllables divided by phonation time.
- phonation_ratio (float): Phonation time divided by duration.
- pause_rate (float): Number of pauses divided by duration.
- mean_pause_dur (float): Total time pausing divided by the number of identified pauses.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_speech_rate(snd)
{
    'speaking_rate': 5.3,
    'articulation_rate': 4.7,
    'phonation_ratio': 0.9,
    'pause_rate': 2.1,
    'mean_pause_dur': 0.5
}
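Useful sources for this code:
  • https://sites.google.com/view/uhm-o-meter/scripts/syllablenuclei_v3?pli=1
  • https://drive.google.com/file/d/1o3mNdN5FKTiYQC9GHB1XoZ8JJIGZk_AK/view
  • (2009 paper) https://doi.org/10.3758/BRM.41.2.385
  • (2021 paper) https://doi.org/10.1080/0969594X.2021.1951162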
def extract_pitch_values( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio]) -> Dict[str, float]:
def extract_pitch_values(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
    """Estimate Pitch Range.

    Calculates the mean pitch using a wide range and uses this to shorten the range for future pitch extraction
    algorithms.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.

    Returns:
        dict: A dictionary containing the following keys:

            - pitch_floor (float): The lowest pitch value to use in future pitch extraction algorithms.
            - pitch_ceiling (float): The highest pitch value to use in future pitch extraction algorithms.

        Both values are NaN when the extraction raises an error. When no voiced frame is found,
        the wider 100-500 Hz range is returned.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.

    Notes:
        Values are taken from: [Standardization of pitch-range settings in voice acoustic analysis](https://doi.org/10.3758/BRM.41.2.318)

        The problem observed with doing a really broad pitch search was the occasional error if F1 was low.
        So crude outlier detection is used to help with this.

        Important: These values are used within other functions, they are not outputs of the full code.

        Different pitch extraction methods in Praat:

        - Cross-correlation (Praat default) vs auto-correlation pitch extraction:
        both are used in different functions below.
        - Cross-correlation is better than auto-correlation at finding period-level variation,
        such as jitter and shimmer, whereas auto-correlation is better at finding intended intonation contours.
        - [Discussion on this on a Praat Forum](https://groups.io/g/Praat-Users-List/topic/pitch_detection_ac_vs_cc/78829266?p=,,,20,0,0,0::recentpostdate/sticky,,,20,2,20,78829266,previd=1612369050729515119,nextid=1605568402827788039&previd=1612369050729515119&nextid=1605568402827788039)

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_pitch_values(snd)
        {'pitch_floor': 60.0, 'pitch_ceiling': 250.0}
        ```
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    try:
        if not isinstance(snd, parselmouth.Sound):
            snd = get_sound(snd)

        pitch_wide = snd.to_pitch_ac(time_step=0.005, pitch_floor=50, pitch_ceiling=600)
        # Other than values above, I'm using default hyperparameters
        # Details: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html

        # Keep voiced frames only (unvoiced frames are reported with a frequency of 0)
        frequencies = pitch_wide.selected_array["frequency"]
        voiced = frequencies[frequencies != 0]

        if voiced.size == 0:
            # Nothing voiced: there is no mean to compute. NaN fails the comparison below,
            # which selects the wider range without triggering NumPy empty-slice warnings.
            mean_pitch = np.nan
        else:
            # Remove outliers (more than 2 standard deviations from the mean) from the wide search.
            # With zero spread (single voiced frame or constant pitch) the z-score would be 0/0
            # and every frame would be discarded, so filtering is skipped in that case.
            spread = np.std(voiced)
            if spread > 0:
                z_scores = (voiced - np.mean(voiced)) / spread
                voiced = voiced[np.abs(z_scores) <= 2]
            mean_pitch = np.mean(voiced)

        # Here there is an interesting alternative solution to discuss: https://praatscripting.lingphon.net/conditionals-1.html
        if mean_pitch < 170:
            # 'male' settings
            pitch_floor = 60.0
            pitch_ceiling = 250.0
        else:
            # 'female' and 'child' settings
            pitch_floor = 100.0
            pitch_ceiling = 500.0

        return {"pitch_floor": pitch_floor, "pitch_ceiling": pitch_ceiling}
    except Exception as e:
        # Best effort: log the failure and report both bounds as NaN.
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {"pitch_floor": np.nan, "pitch_ceiling": np.nan}

Estimate Pitch Range.

Calculates the mean pitch using a wide range and uses this to shorten the range for future pitch extraction algorithms.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
Returns:

dict: A dictionary containing the following keys:

- pitch_floor (float): The lowest pitch value to use in future pitch extraction algorithms.
- pitch_ceiling (float): The highest pitch value to use in future pitch extraction algorithms.
Notes:

Values are taken from: Standardization of pitch-range settings in voice acoustic analysis (https://doi.org/10.3758/BRM.41.2.318)

The problem observed with doing a really broad pitch search was the occasional error if F1 was low. So crude outlier detection is used to help with this.

Important: These values are used within other functions, they are not outputs of the full code.

Different pitch extraction methods in Praat:

  • Cross-correlation (Praat default) vs auto-correlation pitch extraction: both are used in different functions below.
  • Cross-correlation is better than auto-correlation at finding period-level variation, such as jitter and shimmer, whereas auto-correlation is better at finding intended intonation contours.
  • Discussion on this on a Praat Forum
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_pitch_values(snd)
{'pitch_floor': 60.0, 'pitch_ceiling': 250.0}
def extract_pitch_descriptors( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float, frame_shift: float = 0.005, unit: str = 'Hertz') -> Dict[str, float]:
def extract_pitch_descriptors(
    snd: Union[parselmouth.Sound, Path, Audio],
    floor: float,
    ceiling: float,
    frame_shift: float = 0.005,
    unit: str = "Hertz",
) -> Dict[str, float]:
    """Extract Pitch Features.

    Computes the mean and the standard deviation of the pitch contour of a sound.
    The contour comes from the autocorrelation method (pitch_ac), which is better at
    finding intended intonation contours.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
        floor (float): Minimum expected pitch value, e.g. the `pitch_floor` returned by `extract_pitch_values`.
        ceiling (float): Maximum expected pitch value, e.g. the `pitch_ceiling` returned by `extract_pitch_values`.
        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
            Defaults to 0.005.
        unit (str, optional): The unit in which the pitch is returned. Defaults to "Hertz".
            Could be "semitones".

    Returns:
        dict: A dictionary containing the following keys, where {unit} is the lower-cased `unit`
        (both values are NaN when the extraction fails):

            - mean_f0_{unit} (float): Mean pitch in {unit}.
            - stdev_f0_{unit} (float): Standard deviation in {unit}.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.

    Notes:
        - stdev_f0_semitone is used in DOI: 10.1080/02699200400008353, which used this as a marker for dysphonia.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_pitch_descriptors(snd, 75, 500, 0.01, "Hertz")
        {'mean_f0_hertz': 220.5, 'stdev_f0_hertz': 2.5}
        ```
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    # Output keys carry the lower-cased unit name; shared by the success and failure paths.
    unit_tag = unit.lower()
    mean_key = f"mean_f0_{unit_tag}"
    stdev_key = f"stdev_f0_{unit_tag}"

    try:
        sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

        # All other pitch settings are left at the Parselmouth defaults.
        # Details: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Pitch__ac____.html
        pitch_track = sound.to_pitch_ac(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)

        # Query arguments: time range (0, 0 = whole contour) and unit.
        return {
            mean_key: parselmouth.praat.call(pitch_track, "Get mean", 0, 0, unit),
            stdev_key: parselmouth.praat.call(pitch_track, "Get standard deviation", 0, 0, unit),
        }
    except Exception as e:
        # Best effort: log the failure and report both descriptors as NaN.
        frame = inspect.currentframe()
        if frame is not None:
            logger.error(f'Error in "{frame.f_code.co_name}": \n' + str(e))
        return {mean_key: np.nan, stdev_key: np.nan}

Extract Pitch Features.

Function to extract key pitch features from a given sound object. This function uses the pitch_ac method as autocorrelation is better at finding intended intonation contours.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in extract_pitch_values function.
  • ceiling (float): Maximum expected pitch value, set using value found in extract_pitch_values function.
  • frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms. Defaults to 0.005.
  • unit (str, optional): The unit in which the pitch is returned. Defaults to "Hertz". Could be "semitones".
Returns:

dict: A dictionary containing the following keys:

- mean_f0_{unit} (float): Mean pitch in {unit}.
- stdev_f0_{unit} (float): Standard deviation in {unit}.
Notes:
  • Uses pitch_ac as autocorrelation is better at finding intended intonation contours.
  • stdev_f0_semitone is used in DOI: 10.1080/02699200400008353, which used this as a marker for dysphonia.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_pitch_descriptors(snd, 75, 500, 0.01, "Hertz")
{'mean_f0_hertz': 220.5, 'stdev_f0_hertz': 2.5}
def extract_intensity_descriptors( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, frame_shift: float) -> Dict[str, float]:
def extract_intensity_descriptors(
    snd: Union[parselmouth.Sound, Path, Audio], floor: float, frame_shift: float
) -> Dict[str, float]:
    """Extract Intensity Features.

    Function to extract key intensity information from a given sound object.
    This function is based on default Praat code adapted to work with Parselmouth.
    An intensity contour is built with ``snd.to_intensity`` and then queried through
    ``parselmouth.praat.call``.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
            Anything that is not already a ``parselmouth.Sound`` is converted with ``get_sound``.
        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
            Passed to Praat as ``minimum_pitch``.
        frame_shift (float): Time rate at which to extract a new intensity value, typically set to 5 ms.
            Passed to Praat as ``time_step``.

    Returns:
        dict: A dictionary containing the following keys:

            - mean_db (float): Mean intensity in dB (Praat "energy" averaging method).
            - std_db (float): Standard deviation in dB.
            - range_db_ratio (float): Maximum intensity divided by minimum intensity
              (both obtained with parabolic interpolation). NaN when the minimum is exactly 0.

        If any step fails, the error is logged and every value is NaN.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_intensity_descriptors(snd, 75, 0.01)
        {'mean_db': 70.5, 'std_db': 0.5, 'range_db_ratio': 2.5}
        ```

    Notes:
        - Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html
        - For notes on extracting mean settings: https://www.fon.hum.uva.nl/praat/manual/Intro_6_2__Configuring_the_intensity_contour.html
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    try:
        if not isinstance(snd, parselmouth.Sound):
            snd = get_sound(snd)

        # Extract intensity object
        intensity = snd.to_intensity(minimum_pitch=floor, time_step=frame_shift, subtract_mean=True)
        # Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html

        # Extract descriptors; the two zeros select the whole time range.
        mean_db = parselmouth.praat.call(intensity, "Get mean", 0, 0, "energy")  # averaging method
        std_db = parselmouth.praat.call(intensity, "Get standard deviation", 0, 0)
        min_db = parselmouth.praat.call(intensity, "Get minimum", 0, 0, "parabolic")  # interpolation
        max_db = parselmouth.praat.call(intensity, "Get maximum", 0, 0, "parabolic")  # interpolation

        # A minimum of exactly 0 dB would raise ZeroDivisionError and, through the broad
        # handler below, discard the perfectly valid mean and standard deviation as well.
        # Only the ratio is undefined in that case, so only the ratio becomes NaN.
        range_db_ratio = max_db / min_db if min_db != 0 else np.nan

        # Return results
        return {"mean_db": mean_db, "std_db": std_db, "range_db_ratio": range_db_ratio}

    except Exception as e:
        # Best-effort extraction: log the failure and report every descriptor as NaN.
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {"mean_db": np.nan, "std_db": np.nan, "range_db_ratio": np.nan}

Extract Intensity Features.

Function to extract key intensity information from a given sound object. This function is based on default Praat code adapted to work with Parselmouth.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • frame_shift (float): Time rate at which to extract a new intensity value, typically set to 5 ms.
Returns:

dict: A dictionary containing the following keys:

- mean_db (float): Mean intensity in dB.
- std_db (float): Standard deviation in dB.
- range_db_ratio (float): Intensity range, expressed as a ratio in dB.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_intensity_descriptors(snd, 75, 0.01)
{'mean_db': 70.5, 'std_db': 0.5, 'range_db_ratio': 2.5}
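Notes:
  • Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Intensity___.html
  • For notes on extracting mean settings: https://www.fon.hum.uva.nl/praat/manual/Intro_6_2__Configuring_the_intensity_contour.html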
def extract_harmonicity_descriptors( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, frame_shift: float) -> Dict[str, float]:
def extract_harmonicity_descriptors(
    snd: Union[parselmouth.Sound, Path, Audio], floor: float, frame_shift: float
) -> Dict[str, float]:
    """Voice Quality - HNR.

    Compute the mean and standard deviation of the Harmonic to Noise Ratio (HNR) in dB
    for a sound, using Praat's cross-correlation (CC) harmonicity analysis.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
            Inputs that are not a ``parselmouth.Sound`` are converted with ``get_sound``.
        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
        frame_shift (float): Time rate at which to extract a new harmonicity value, typically set to 5 ms.

    Returns:
        dict: A dictionary containing the following keys:

            - hnr_db_mean (float): Mean Harmonic to Noise Ratio in dB.
            - hnr_db_std_dev (float): Harmonic to Noise Ratio standard deviation in dB.

        When the extraction fails, the error is logged and both values are NaN.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_harmonicity_descriptors(snd, 75, 0.01)
        {'hnr_db_mean': 15.3, 'hnr_db_std_dev': 0.5}
        ```

    Notes:
        - Praat recommends using the CC method: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__cc____.html
        - Default settings can be found at: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__ac____.html
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    try:
        sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

        # Cross-correlation harmonicity contour (the method Praat recommends).
        hnr_contour = sound.to_harmonicity_cc(
            time_step=frame_shift, minimum_pitch=floor, silence_threshold=0.1, periods_per_window=4.5
        )

        # Both statistics are taken over the whole contour (time range 0, 0).
        descriptors: Dict[str, float] = {}
        descriptors["hnr_db_mean"] = parselmouth.praat.call(hnr_contour, "Get mean", 0, 0)
        descriptors["hnr_db_std_dev"] = parselmouth.praat.call(hnr_contour, "Get standard deviation", 0, 0)
        return descriptors
    except Exception as e:
        frame = inspect.currentframe()
        if frame is not None:
            current_function_name = frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))

        return dict.fromkeys(("hnr_db_mean", "hnr_db_std_dev"), np.nan)

Voice Quality - HNR.

Function to calculate the Harmonic to Noise Ratio (HNR) in dB from a given sound object. This function uses the CC method as recommended by Praat.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
Returns:

dict: A dictionary containing the following keys:

- hnr_db_mean (float): Mean Harmonic to Noise Ratio in dB.
- hnr_db_std_dev (float): Harmonic to Noise Ratio standard deviation in dB.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_harmonicity_descriptors(snd, 75, 0.01)
{'hnr_db_mean': 15.3, 'hnr_db_std_dev': 0.5}
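Notes:
  • Praat recommends using the CC method: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__cc____.html
  • Default settings can be found at: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Harmonicity__ac____.html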
def extract_slope_tilt( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float) -> Dict[str, float]:
def extract_slope_tilt(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
    """Voice Quality - Spectral Slope/Tilt.

    Function to extract spectral slope and tilt from a given sound object. This function is based on default
    Praat code adapted to work with Parselmouth. Both measures are read from a pitch-corrected
    long-term average spectrum (LTAS).

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
            Anything that is not already a ``parselmouth.Sound`` is converted with ``get_sound``.
        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.

    Returns:
        dict: A dictionary containing the following keys:

            - spectral_slope (float): Spectral slope returned by Praat's "Get slope" query.
            - spectral_tilt (float): Spectral tilt parsed from Praat's "Report spectral tilt" output.
              NaN when the report does not contain a parsable slope value.

        If the LTAS cannot be computed at all, the error is logged and both values are NaN.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_slope_tilt(snd, 75, 500)
        {'spectral_slope': -0.8, 'spectral_tilt': -2.5}
        ```

    Notes:
        - Spectral Slope: Ratio of energy in a spectrum between 50-1000Hz over 1000-4000Hz.
        - Spectral Tilt: Linear slope of energy distribution between 100-5000Hz.
        - Using pitch-corrected LTAS to remove the effect of F0 and harmonics on the slope calculation:
          https://www.fon.hum.uva.nl/paul/papers/BoersmaKovacic2006.pdf
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    try:
        if not isinstance(snd, parselmouth.Sound):
            snd = get_sound(snd)

        ltas_rep = parselmouth.praat.call(
            snd, "To Ltas (pitch-corrected)...", floor, ceiling, 5000, 100, 0.0001, 0.02, 1.3
        )
        # Hyperparameters: Min Pitch (Hz), Max Pitch (Hz), Maximum Frequency (Hz), Bandwidth (Hz), Shortest Period (s),
        # Longest Period (s), Maximum period factor

        spectral_slope = parselmouth.praat.call(ltas_rep, "Get slope", 50, 1000, 1000, 4000, "dB")
        # Hyperparameters: f1min, f1max, f2min, f2max, averagingUnits

        spectral_tilt_report = parselmouth.praat.call(ltas_rep, "Report spectral tilt", 100, 5000, "Linear", "Robust")
        # Hyperparameters: minimumFrequency, maximumFrequency, Frequency Scale (linear or logarithmic),
        # Fit method (least squares or robust)

        # The report is free text; the number of interest sits between "Slope: " and its "dB" unit.
        # Parsing is isolated so that an unparsable report (e.g. an undefined slope) only
        # invalidates the tilt instead of also discarding the already computed slope.
        try:
            tilt_text = spectral_tilt_report.split("Slope: ", 1)[1].partition("dB")[0]
            spectral_tilt = float(tilt_text)
        except (IndexError, ValueError) as parse_error:
            logger.error('Error in "extract_slope_tilt": \n' + str(parse_error))
            spectral_tilt = np.nan

        # Return results
        return {"spectral_slope": spectral_slope, "spectral_tilt": spectral_tilt}

    except Exception as e:
        # Best-effort extraction: log the failure and report both descriptors as NaN.
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {"spectral_slope": np.nan, "spectral_tilt": np.nan}

Voice Quality - Spectral Slope/Tilt.

Function to extract spectral slope and tilt from a given sound object. This function is based on default Praat code adapted to work with Parselmouth.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • ceiling (float): Maximum expected pitch value, set using value found in pitch_values function.
Returns:

dict: A dictionary containing the following keys:

- spectral_slope (float): Mean spectral slope.
- spectral_tilt (float): Mean spectral tilt.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_slope_tilt(snd, 75, 500)
{'spectral_slope': -0.8, 'spectral_tilt': -2.5}
Notes:
  • Spectral Slope: Ratio of energy in a spectrum between 50-1000Hz over 1000-4000Hz.
  • Spectral Tilt: Linear slope of energy distribution between 100-5000Hz.
  • Using pitch-corrected LTAS to remove the effect of F0 and harmonics on the slope calculation: https://www.fon.hum.uva.nl/paul/papers/BoersmaKovacic2006.pdf
def extract_cpp_descriptors( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float, frame_shift: float) -> Dict[str, float]:
def extract_cpp_descriptors(
    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, frame_shift: float
) -> Dict[str, float]:
    """Extract Cepstral Peak Prominence (CPP).

    Compute the smoothed Cepstral Peak Prominence (Praat "Get CPPS...") of every voiced
    interval of a sound and summarise the values with their mean and standard deviation.
    Voiced intervals are found from an autocorrelation pitch track converted to a
    voiced/unvoiced TextGrid. Only CPP values greater than 4 are kept.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
            Inputs that are not a ``parselmouth.Sound`` are converted with ``get_sound``.
        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.

    Returns:
        dict: A dictionary containing the following keys:

            - mean_cpp (float): Mean Cepstral Peak Prominence.
            - std_dev_cpp (float): Standard deviation in Cepstral Peak Prominence.

        Both values are NaN when no voiced interval yields a usable CPP or when the
        extraction fails (the error is logged).

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> extract_cpp_descriptors(snd, 75, 500, 0.01)
        {'mean_cpp': 20.3, 'std_dev_cpp': 0.5}
        ```

    Notes:
        - Cepstral Peak Prominence: The height (i.e., "prominence") of that peak relative to a regression line
          through the overall cepstrum.
        - Adapted from: https://osf.io/ctwgr and http://phonetics.linguistics.ucla.edu/facilities/acoustic/voiced_extract_auto.txt
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    try:
        sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

        # Pitch track -> glottal pulses -> voiced/unvoiced TextGrid -> table of intervals.
        pitch_track = sound.to_pitch_ac(
            time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling, voicing_threshold=0.3
        )
        praat_call = parselmouth.praat.call
        glottal_pulses = praat_call([sound, pitch_track], "To PointProcess (cc)")
        vuv_grid = praat_call(glottal_pulses, "To TextGrid (vuv)", 0.02, 0.1)
        # Arguments: include line number, time decimals, include tier names, include empty intervals
        interval_table = praat_call(vuv_grid, "Down to Table", "no", 6, "yes", "no")

        voiced_cpps = []
        row_count = praat_call(interval_table, "Get number of rows")
        for row in range(1, row_count + 1):  # Praat table rows are 1-based
            if praat_call(interval_table, "Get value", row, "text") != "V":
                continue  # only voiced intervals contribute

            start_time = praat_call(interval_table, "Get value", row, "tmin")
            end_time = praat_call(interval_table, "Get value", row, "tmax")
            voiced_part = sound.extract_part(float(start_time), float(end_time))

            # PowerCepstrogram (60-Hz pitch floor, 2-ms time step, 5-kHz maximum frequency,
            # and pre-emphasis from 50 Hz)
            cepstrogram = praat_call(voiced_part, "To PowerCepstrogram", 60, 0.002, 5000, 50)

            try:
                # Subtract tilt before smoothing = "no"; time averaging window = 0.01 s;
                # quefrency averaging window = 0.001 s;
                # Peak search pitch range = 60-330 Hz; tolerance = 0.05; interpolation = "Parabolic";
                # tilt line frequency range = 0.001-0 s (no upper bound);
                # Line type = "Straight"; fit method = "Robust."
                cpp_value = praat_call(
                    cepstrogram,
                    "Get CPPS...",
                    "no",
                    0.01,
                    0.001,
                    60,
                    330,
                    0.05,
                    "parabolic",
                    0.001,
                    0,
                    "Straight",
                    "Robust",
                )
            except Exception as e:
                # A failing interval is logged and skipped; the remaining intervals still count.
                inner_frame = inspect.currentframe()
                if inner_frame is not None:
                    current_function_name = inner_frame.f_code.co_name
                    logger.error(f'Error in "{current_function_name}": \n' + str(e))
                cpp_value = np.nan

            if np.isnan(cpp_value) or cpp_value <= 4:
                continue
            voiced_cpps.append(cpp_value)

        # Summarise the per-interval values.
        if not voiced_cpps:
            return {"mean_cpp": np.nan, "std_dev_cpp": np.nan}

        cpp_values = np.array(voiced_cpps)
        return {"mean_cpp": np.mean(cpp_values), "std_dev_cpp": np.std(cpp_values)}

    except Exception as e:
        frame = inspect.currentframe()
        if frame is not None:
            current_function_name = frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {"mean_cpp": np.nan, "std_dev_cpp": np.nan}

Extract Cepstral Peak Prominence (CPP).

Function to calculate the Cepstral Peak Prominence (CPP) from a given sound object. This function is adapted from default Praat code to work with Parselmouth.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • ceiling (float): Maximum expected pitch value, set using value found in pitch_values function.
  • frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
Returns:

dict: A dictionary containing the following keys:

- mean_cpp (float): Mean Cepstral Peak Prominence.
- std_dev_cpp (float): Standard deviation in Cepstral Peak Prominence.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_cpp_descriptors(snd, 75, 500, 0.01)
{'mean_cpp': 20.3, 'std_dev_cpp': 0.5}
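Notes:
  • Cepstral Peak Prominence: The height (i.e., "prominence") of that peak relative to a regression line through the overall cepstrum.
  • Adapted from: https://osf.io/ctwgr and http://phonetics.linguistics.ucla.edu/facilities/acoustic/voiced_extract_auto.txt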
def measure_f1f2_formants_bandwidths( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float, frame_shift: float) -> Dict[str, float]:
def measure_f1f2_formants_bandwidths(
    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, frame_shift: float
) -> Dict[str, float]:
    """Extract Formant Frequency Features.

    Measure the frequency and bandwidth of the first two formants at every glottal pulse
    of a sound and summarise each of the four series with its mean and standard deviation.
    Formants come from Praat's Burg analysis; pulses come from a cross-correlation pitch track.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
            Inputs that are not a ``parselmouth.Sound`` are converted with ``get_sound``.
        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
            Also used as the time step of the formant analysis.

    Returns:
        dict: A dictionary containing the following keys:

            - f1_mean (float): Mean F1 location.
            - f1_std (float): Standard deviation of F1 location.
            - b1_mean (float): Mean F1 bandwidth.
            - b1_std (float): Standard deviation of F1 bandwidth.
            - f2_mean (float): Mean F2 location.
            - f2_std (float): Standard deviation of F2 location.
            - b2_mean (float): Mean F2 bandwidth.
            - b2_std (float): Standard deviation of F2 bandwidth.

        A series without any defined value yields NaN for its mean and standard deviation.
        If the extraction fails, the error is logged and every value is NaN.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.

    Examples:
        ```python
        >>> snd = parselmouth.Sound("path_to_audio.wav")
        >>> measure_f1f2_formants_bandwidths(snd, 75, 500, 0.01)
        {'f1_mean': 500.0, 'f1_std': 50.0, 'b1_mean': 80.0, 'b1_std': 10.0, 'f2_mean': 1500.0,
        'f2_std': 100.0, 'b2_mean': 120.0, 'b2_std': 20.0}
        ```

    Notes:
        - Formants are the resonances of the vocal tract, determined by tongue placement and vocal tract shape.
        - Mean F1 typically varies between 300 to 750 Hz, while mean F2 typically varies between 900 to 2300 Hz.
        - Formant bandwidth is measured by taking the width of the band forming 3 dB down from the formant peak.
        - Formant extraction occurs per pitch period (pulses), meaning that the analysis identifies the points in the
          sound where the vocal folds come together, helping to align the formant measurements precisely with the
          pitch periods.
        - Adapted from code at this [link](https://osf.io/6dwr3/).
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    def _mean_and_std(values: List[float]) -> Any:
        # Empty series have no statistics; report them as NaN instead of warning in numpy.
        return (np.mean(values), np.std(values)) if values else (np.nan, np.nan)

    try:
        sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

        # Key Hyperparameters: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Formant__burg____.html
        formant_track = parselmouth.praat.call(sound, "To Formant (burg)", frame_shift, 5, 5000, 0.025, 50)

        # Glottal pulses from a cross-correlation pitch track give the measurement times.
        pitch_track = sound.to_pitch_cc(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)
        glottal_pulses = parselmouth.praat.call([sound, pitch_track], "To PointProcess (cc)")

        f1_values: List[float] = []
        b1_values: List[float] = []
        f2_values: List[float] = []
        b2_values: List[float] = []
        # (Praat query, formant number, destination series), in measurement order.
        measurements = (
            ("Get value at time", 1, f1_values),
            ("Get bandwidth at time", 1, b1_values),
            ("Get value at time", 2, f2_values),
            ("Get bandwidth at time", 2, b2_values),
        )

        pulse_count = parselmouth.praat.call(glottal_pulses, "Get number of points")
        for pulse_index in range(1, pulse_count + 1):
            pulse_time = parselmouth.praat.call(glottal_pulses, "Get time from index", pulse_index)
            for query, formant_number, series in measurements:
                measured = parselmouth.praat.call(
                    formant_track, query, formant_number, pulse_time, "Hertz", "Linear"
                )
                if not np.isnan(measured):
                    series.append(measured)

        f1_mean, f1_std = _mean_and_std(f1_values)
        b1_mean, b1_std = _mean_and_std(b1_values)
        f2_mean, f2_std = _mean_and_std(f2_values)
        b2_mean, b2_std = _mean_and_std(b2_values)

        return {
            "f1_mean": f1_mean,
            "f1_std": f1_std,
            "b1_mean": b1_mean,
            "b1_std": b1_std,
            "f2_mean": f2_mean,
            "f2_std": f2_std,
            "b2_mean": b2_mean,
            "b2_std": b2_std,
        }

    except Exception as e:
        frame = inspect.currentframe()
        if frame is not None:
            current_function_name = frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return dict.fromkeys(
            ("f1_mean", "f1_std", "b1_mean", "b1_std", "f2_mean", "f2_std", "b2_mean", "b2_std"),
            np.nan,
        )

Extract Formant Frequency Features.

Function to extract formant frequency features from a given sound object. This function is adapted from default Praat code to work with Parselmouth.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • ceiling (float): Maximum expected pitch value, set using value found in pitch_values function.
  • frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
Returns:

dict: A dictionary containing the following keys:

- f1_mean (float): Mean F1 location.
- f1_std (float): Standard deviation of F1 location.
- b1_mean (float): Mean F1 bandwidth.
- b1_std (float): Standard deviation of F1 bandwidth.
- f2_mean (float): Mean F2 location.
- f2_std (float): Standard deviation of F2 location.
- b2_mean (float): Mean F2 bandwidth.
- b2_std (float): Standard deviation of F2 bandwidth.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> measure_f1f2_formants_bandwidths(snd, 75, 500, 0.01)
{'f1_mean': 500.0, 'f1_std': 50.0, 'b1_mean': 80.0, 'b1_std': 10.0, 'f2_mean': 1500.0,
'f2_std': 100.0, 'b2_mean': 120.0, 'b2_std': 20.0}
Notes:
  • Formants are the resonances of the vocal tract, determined by tongue placement and vocal tract shape.
  • Mean F1 typically varies between 300 to 750 Hz, while mean F2 typically varies between 900 to 2300 Hz.
  • Formant bandwidth is measured by taking the width of the band forming 3 dB down from the formant peak.
  • Formant extraction occurs per pitch period (pulses), meaning that the analysis identifies the points in the sound where the vocal folds come together, helping to align the formant measurements precisely with the pitch periods.
  • Adapted from code at https://osf.io/6dwr3/.
def extract_spectral_moments( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float, window_size: float, frame_shift: float) -> Dict[str, float]:
 929def extract_spectral_moments(
 930    snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float, window_size: float, frame_shift: float
 931) -> Dict[str, float]:
 932    """Extract Spectral Moments.
 933
 934    Function to extract spectral moments from a given sound object. This function is adapted from default
 935    Praat code to work with Parselmouth.
 936
 937    Args:
 938        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
 939        floor (float): Minimum expected pitch value, set using value found in `pitch_values` function.
 940        ceiling (float): Maximum expected pitch value, set using value found in `pitch_values` function.
 941        window_size (float): Time frame over which the spectra is calculated, typically set to 25 ms.
 942        frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
 943
 944    Returns:
 945        dict: A dictionary containing the following keys:
 946
 947            - spectral_gravity (float): Mean spectral gravity.
 948            - spectral_std_dev (float): Mean spectral standard deviation.
 949            - spectral_skewness (float): Mean spectral skewness.
 950            - spectral_kurtosis (float): Mean spectral kurtosis.
 951
 952    Examples:
 953        ```python
 954        >>> snd = parselmouth.Sound("path_to_audio.wav")
 955        >>> extract_spectral_moments(snd, 75, 500, 0.025, 0.01)
 956        {'spectral_gravity': 5000.0, 'spectral_std_dev': 150.0, 'spectral_skewness': -0.5, 'spectral_kurtosis': 3.0}
 957        ```
 958
 959    Notes:
 960        - Spectral Gravity: Measure for how high the frequencies in a spectrum are on average over the entire frequency
 961        domain weighted by the power spectrum.
 962        - Spectral Standard Deviation: Measure for how much the frequencies in a spectrum can deviate from the centre
 963        of gravity.
 964        - Spectral Skewness: Measure for how much the shape of the spectrum below the centre of gravity is different
 965        from the shape above the mean frequency.
 966        - Spectral Kurtosis: Measure for how much the shape of the spectrum around the centre of gravity is different
 967          from a Gaussian shape.
 968        - Details: https://www.fon.hum.uva.nl/praat/manual/Spectrum__Get_central_moment___.html
 969    """
 970    if not PARSELMOUTH_AVAILABLE:
 971        raise ModuleNotFoundError(
 972            "`parselmouth` is not installed. "
 973            "Please install senselab audio dependencies using `pip install senselab`."
 974        )
 975
 976    try:
 977        if not isinstance(snd, parselmouth.Sound):
 978            snd = get_sound(snd)
 979
 980        # Extract pitch object for voiced checking
 981        pitch = snd.to_pitch_ac(time_step=frame_shift, pitch_floor=floor, pitch_ceiling=ceiling)
 982
 983        # Calculate Spectrogram
 984        spectrogram = snd.to_spectrogram(window_length=window_size, time_step=frame_shift)
 985        # Using default settings other than window length and frame shift
 986        # Details: https://www.fon.hum.uva.nl/praat/manual/Sound__To_Spectrogram___.html
 987
 988        Gravity_list, STD_list, Skew_list, Kurt_list = [], [], [], []
 989
 990        num_steps = parselmouth.praat.call(spectrogram, "Get number of frames")
 991        for i in range(1, num_steps + 1):
 992            t = parselmouth.praat.call(spectrogram, "Get time from frame number", i)
 993            pitch_value = pitch.get_value_at_time(t)
 994
 995            if not np.isnan(pitch_value):
 996                voiced_spectrum = spectrogram.to_spectrum_slice(t)
 997                # Details: https://www.fon.hum.uva.nl/praat/manual/Spectrogram__To_Spectrum__slice____.html
 998
 999                Gravity_LLD = voiced_spectrum.get_centre_of_gravity(power=2)
1000                if not np.isnan(Gravity_LLD):
1001                    Gravity_list.append(Gravity_LLD)
1002
1003                STD_LLD = voiced_spectrum.get_standard_deviation(power=2)
1004                if not np.isnan(STD_LLD):
1005                    STD_list.append(STD_LLD)
1006
1007                Skew_LLD = voiced_spectrum.get_skewness(power=2)
1008                if not np.isnan(Skew_LLD):
1009                    Skew_list.append(Skew_LLD)
1010
1011                Kurt_LLD = voiced_spectrum.get_kurtosis(power=2)
1012                if not np.isnan(Kurt_LLD):
1013                    Kurt_list.append(Kurt_LLD)
1014
1015        gravity_mean = np.mean(Gravity_list) if Gravity_list else np.nan
1016        std_mean = np.mean(STD_list) if STD_list else np.nan
1017        skew_mean = np.mean(Skew_list) if Skew_list else np.nan
1018        kurt_mean = np.mean(Kurt_list) if Kurt_list else np.nan
1019
1020        return {
1021            "spectral_gravity": gravity_mean,
1022            "spectral_std_dev": std_mean,
1023            "spectral_skewness": skew_mean,
1024            "spectral_kurtosis": kurt_mean,
1025        }
1026
1027    except Exception as e:
1028        current_frame = inspect.currentframe()
1029        if current_frame is not None:
1030            current_function_name = current_frame.f_code.co_name
1031            logger.error(f'Error in "{current_function_name}": \n' + str(e))
1032        return {
1033            "spectral_gravity": np.nan,
1034            "spectral_std_dev": np.nan,
1035            "spectral_skewness": np.nan,
1036            "spectral_kurtosis": np.nan,
1037        }

Extract Spectral Moments.

Function to extract spectral moments from a given sound object. This function is adapted from default Praat code to work with Parselmouth.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object or a file path or an Audio object.
  • floor (float): Minimum expected pitch value, set using value found in pitch_values function.
  • ceiling (float): Maximum expected pitch value, set using value found in pitch_values function.
  • window_size (float): Time frame over which the spectra is calculated, typically set to 25 ms.
  • frame_shift (float): Time rate at which to extract a new pitch value, typically set to 5 ms.
Returns:

dict: A dictionary containing the following keys:

- spectral_gravity (float): Mean spectral gravity.
- spectral_std_dev (float): Mean spectral standard deviation.
- spectral_skewness (float): Mean spectral skewness.
- spectral_kurtosis (float): Mean spectral kurtosis.
Examples:
>>> snd = parselmouth.Sound("path_to_audio.wav")
>>> extract_spectral_moments(snd, 75, 500, 0.025, 0.01)
{'spectral_gravity': 5000.0, 'spectral_std_dev': 150.0, 'spectral_skewness': -0.5, 'spectral_kurtosis': 3.0}
Notes:
  • Spectral Gravity: Measure for how high the frequencies in a spectrum are on average over the entire frequency domain weighted by the power spectrum.
  • Spectral Standard Deviation: Measure for how much the frequencies in a spectrum can deviate from the centre of gravity.
  • Spectral Skewness: Measure for how much the shape of the spectrum below the centre of gravity is different from the shape above the mean frequency.
  • Spectral Kurtosis: Measure for how much the shape of the spectrum around the centre of gravity is different from a Gaussian shape.
  • Details: https://www.fon.hum.uva.nl/praat/manual/Spectrum__Get_central_moment___.html
def extract_audio_duration( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio]) -> Dict[str, float]:
def extract_audio_duration(snd: Union[parselmouth.Sound, Path, Audio]) -> Dict[str, float]:
    """Measure the total duration of a sound.

    The input is converted to a Parselmouth `Sound` when it is not one already,
    and Praat's "Get total duration" query is run on it.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object,
            a file path (Path), or an `Audio` object.

    Returns:
        Dict[str, float]: A dictionary with a single key:
            - "duration" (float): The value returned by Praat's "Get total duration"
              (seconds), or NaN when the Praat call fails (the error is logged).

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.

    Note:
        The conversion through `get_sound` happens outside the error handler, so
        any exception it raises while loading the input propagates to the caller.

    Example:
        ```python
        >>> snd = Audio(waveform=[...], sampling_rate=16000)
        >>> extract_audio_duration(snd)
        {'duration': 5.23}
        ```
    """
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    # Anything that is not already a Parselmouth Sound (Path or Audio) is converted first.
    sound = snd if isinstance(snd, parselmouth.Sound) else get_sound(snd)

    try:
        return {"duration": parselmouth.praat.call(sound, "Get total duration")}
    except Exception as err:
        # Best effort: report the failure and fall back to NaN instead of raising.
        frame = inspect.currentframe()
        if frame is not None:
            logger.error(f'Error in "{frame.f_code.co_name}": \n' + str(err))
        return {"duration": np.nan}

Get the duration of a given audio file or Audio object.

This function calculates the total duration of an audio file or audio object by creating a Parselmouth Sound object and then calling a Praat method to retrieve the duration of the audio in seconds.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path), or an Audio object containing the audio waveform and its corresponding sampling rate.
Returns:

Dict[str, float]: A dictionary containing: - "duration" (float): The total duration of the audio in seconds.

Raises:
  • FileNotFoundError: If a provided file path does not exist.
Example:
>>> snd = Audio(waveform=[...], sampling_rate=16000)
>>> extract_audio_duration(snd)
{'duration': 5.23}
def extract_jitter( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float) -> Dict[str, float]:
def extract_jitter(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
    """Returns the jitter descriptors for the given sound or audio file.

    The sound is converted to a Praat PointProcess with
    "To PointProcess (periodic, cc)" using the given F0 bounds, and the five
    "Get jitter (...)" queries are then run on that point process.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path),
            or an `Audio` object containing the audio waveform and its corresponding sampling rate.
        floor (float): Minimum fundamental frequency (F0) in Hz.
        ceiling (float): Maximum fundamental frequency (F0) in Hz.

    Returns:
        Dict[str, float]: A dictionary with the keys "local_jitter",
        "localabsolute_jitter", "rap_jitter", "ppq5_jitter" and "ddp_jitter".
        If any Praat call fails, the error is logged and every value is NaN.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.
    """
    # This guard must run before the helpers below are defined: their annotations
    # reference `parselmouth.Data`, which the fallback stub used when parselmouth is
    # missing does not provide, so defining them first would raise AttributeError
    # instead of the intended ModuleNotFoundError.
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    def _to_point_process(sound: parselmouth.Sound, f0min: float, f0max: float) -> parselmouth.Data:
        return parselmouth.praat.call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

    def _extract_jitter(measure: str, point_process: parselmouth.Data) -> float:
        # Arguments after the command name follow Praat's "Get jitter" settings:
        # time range start/end (0, 0 selects the whole signal), shortest period,
        # longest period and maximum period factor.
        return parselmouth.praat.call(point_process, f"Get jitter ({measure})", 0, 0, 0.0001, 0.02, 1.3)

    # Check if the input is a Path or Audio, and convert to Parselmouth Sound if necessary
    if not isinstance(snd, parselmouth.Sound):
        snd = get_sound(snd)

    try:
        # Convert the sound to a point process for jitter measurement
        point_process = _to_point_process(snd, floor, ceiling)

        # Extract jitter measures from the point process
        return {
            "local_jitter": _extract_jitter("local", point_process),
            "localabsolute_jitter": _extract_jitter("local, absolute", point_process),
            "rap_jitter": _extract_jitter("rap", point_process),
            "ppq5_jitter": _extract_jitter("ppq5", point_process),
            "ddp_jitter": _extract_jitter("ddp", point_process),
        }

    except Exception as e:
        # Best effort: log the failure and return NaN for every descriptor.
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {
            "local_jitter": np.nan,
            "localabsolute_jitter": np.nan,
            "rap_jitter": np.nan,
            "ppq5_jitter": np.nan,
            "ddp_jitter": np.nan,
        }

Returns the jitter descriptors for the given sound or audio file.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path), or an Audio object containing the audio waveform and its corresponding sampling rate.
  • floor (float): Minimum fundamental frequency (F0) in Hz.
  • ceiling (float): Maximum fundamental frequency (F0) in Hz.
Returns:

Dict[str, float]: A dictionary containing various jitter measurements.

def extract_shimmer( snd: Union[parselmouth.Sound, pathlib.Path, senselab.audio.data_structures.audio.Audio], floor: float, ceiling: float) -> Dict[str, float]:
def extract_shimmer(snd: Union[parselmouth.Sound, Path, Audio], floor: float, ceiling: float) -> Dict[str, float]:
    """Returns the shimmer descriptors for the given sound or audio file.

    The sound is converted to a Praat PointProcess with
    "To PointProcess (periodic, cc)" using the given F0 bounds, and the six
    "Get shimmer (...)" queries are then run on the sound together with that
    point process.

    Args:
        snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path),
            or an `Audio` object containing the audio waveform and its corresponding sampling rate.
        floor (float): Minimum fundamental frequency (F0) in Hz.
        ceiling (float): Maximum fundamental frequency (F0) in Hz.

    Returns:
        Dict[str, float]: A dictionary with the keys "local_shimmer",
        "localDB_shimmer", "apq3_shimmer", "apq5_shimmer", "apq11_shimmer" and
        "dda_shimmer". If any Praat call fails, the error is logged and every
        value is NaN.

    Raises:
        ModuleNotFoundError: If `parselmouth` is not installed.
    """
    # This guard must run before the helpers below are defined: their annotations
    # reference `parselmouth.Data`, which the fallback stub used when parselmouth is
    # missing does not provide, so defining them first would raise AttributeError
    # instead of the intended ModuleNotFoundError.
    if not PARSELMOUTH_AVAILABLE:
        raise ModuleNotFoundError(
            "`parselmouth` is not installed. "
            "Please install senselab audio dependencies using `pip install senselab`."
        )

    def _to_point_process(sound: parselmouth.Sound, f0min: float, f0max: float) -> parselmouth.Data:
        return parselmouth.praat.call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

    def _extract_shimmer(measure: str, sound: parselmouth.Sound, point_process: parselmouth.Data) -> float:
        # Arguments after the command name follow Praat's "Get shimmer" settings:
        # time range start/end (0, 0 selects the whole signal), shortest period,
        # longest period, maximum period factor and maximum amplitude factor.
        return parselmouth.praat.call(
            [sound, point_process], f"Get shimmer ({measure})", 0, 0, 0.0001, 0.02, 1.3, 1.6
        )

    # Check if the input is a Path or Audio, and convert to Parselmouth Sound if necessary
    if not isinstance(snd, parselmouth.Sound):
        snd = get_sound(snd)

    try:
        # Convert the sound to a point process for shimmer measurement
        point_process = _to_point_process(snd, floor, ceiling)

        # Extract shimmer measures from the sound and point process
        return {
            "local_shimmer": _extract_shimmer("local", snd, point_process),
            "localDB_shimmer": _extract_shimmer("local_dB", snd, point_process),
            "apq3_shimmer": _extract_shimmer("apq3", snd, point_process),
            "apq5_shimmer": _extract_shimmer("apq5", snd, point_process),
            "apq11_shimmer": _extract_shimmer("apq11", snd, point_process),
            "dda_shimmer": _extract_shimmer("dda", snd, point_process),
        }

    except Exception as e:
        # Best effort: log the failure and return NaN for every descriptor.
        current_frame = inspect.currentframe()
        if current_frame is not None:
            current_function_name = current_frame.f_code.co_name
            logger.error(f'Error in "{current_function_name}": \n' + str(e))
        return {
            "local_shimmer": np.nan,
            "localDB_shimmer": np.nan,
            "apq3_shimmer": np.nan,
            "apq5_shimmer": np.nan,
            "apq11_shimmer": np.nan,
            "dda_shimmer": np.nan,
        }

Returns the shimmer descriptors for the given sound or audio file.

Arguments:
  • snd (Union[parselmouth.Sound, Path, Audio]): A Parselmouth Sound object, a file path (Path), or an Audio object containing the audio waveform and its corresponding sampling rate.
  • floor (float): Minimum fundamental frequency (F0) in Hz.
  • ceiling (float): Maximum fundamental frequency (F0) in Hz.
Returns:

Dict[str, float]: A dictionary containing various shimmer measurements.

def extract_praat_parselmouth_features_from_audios( audios: List[senselab.audio.data_structures.audio.Audio], time_step: float = 0.005, window_length: float = 0.025, pitch_unit: str = 'Hertz', speech_rate: bool = True, intensity_descriptors: bool = True, harmonicity_descriptors: bool = True, formants: bool = True, spectral_moments: bool = True, pitch: bool = True, slope_tilt: bool = True, cpp_descriptors: bool = True, duration: bool = True, jitter: bool = True, shimmer: bool = True) -> List[Dict[str, Any]]:
def extract_praat_parselmouth_features_from_audios(
    audios: List[Audio],
    time_step: float = 0.005,
    window_length: float = 0.025,
    pitch_unit: str = "Hertz",
    speech_rate: bool = True,
    intensity_descriptors: bool = True,
    harmonicity_descriptors: bool = True,
    formants: bool = True,
    spectral_moments: bool = True,
    pitch: bool = True,
    slope_tilt: bool = True,
    cpp_descriptors: bool = True,
    duration: bool = True,
    jitter: bool = True,
    shimmer: bool = True,
) -> List[Dict[str, Any]]:
    """Extract the selected Praat/Parselmouth features for each Audio in a list.

    For every audio, `extract_pitch_values` is always called first; its
    "pitch_floor" and "pitch_ceiling" values are passed on to the extractors
    that take F0 bounds. Each enabled feature group is then computed and its
    values are copied into one flat dictionary under the output key names.

    Args:
        audios (list): List of Audio objects to extract features from.
        time_step (float): Time rate at which to extract features. Defaults to 0.005.
        window_length (float): Window length in seconds for spectral features. Defaults to 0.025.
        pitch_unit (str): Unit for pitch measurements. Its lower-cased form is part of
            the pitch key names (e.g. "mean_f0_hertz"). Defaults to "Hertz".
        speech_rate (bool): Whether to extract speech rate. Defaults to True.
        intensity_descriptors (bool): Whether to extract intensity descriptors. Defaults to True.
        harmonicity_descriptors (bool): Whether to extract harmonic descriptors. Defaults to True.
        formants (bool): Whether to extract formants. Defaults to True.
        spectral_moments (bool): Whether to extract spectral moments. Defaults to True.
        pitch (bool): Whether to extract pitch. Defaults to True.
        slope_tilt (bool): Whether to extract slope and tilt. Defaults to True.
        cpp_descriptors (bool): Whether to extract CPP descriptors. Defaults to True.
        duration (bool): Whether to extract duration. Defaults to True.
        jitter (bool): Whether to extract jitter. Defaults to True.
        shimmer (bool): Whether to extract shimmer. Defaults to True.

    Returns:
        List[Dict[str, Any]]: One flat feature dictionary per input audio, in the same
        order as `audios`. A dictionary only holds the keys of the enabled feature groups.
    """
    # Each table lists (output key, key in the extractor's result) pairs.
    duration_keys = (("duration", "duration"),)
    speech_rate_keys = (
        ("speaking_rate", "speaking_rate"),
        ("articulation_rate", "articulation_rate"),
        ("phonation_ratio", "phonation_ratio"),
        ("pause_rate", "pause_rate"),
        ("mean_pause_duration", "mean_pause_dur"),
    )
    intensity_keys = (
        ("mean_intensity_db", "mean_db"),
        ("std_intensity_db", "std_db"),
        ("range_ratio_intensity_db", "range_db_ratio"),
    )
    harmonicity_keys = (
        ("mean_hnr_db", "hnr_db_mean"),
        ("std_hnr_db", "hnr_db_std_dev"),
    )
    slope_tilt_keys = (
        ("spectral_slope", "spectral_slope"),
        ("spectral_tilt", "spectral_tilt"),
    )
    cpp_keys = (
        ("cepstral_peak_prominence_mean", "mean_cpp"),
        ("cepstral_peak_prominence_std", "std_dev_cpp"),
    )
    formant_keys = (
        ("mean_f1_loc", "f1_mean"),
        ("std_f1_loc", "f1_std"),
        ("mean_b1_loc", "b1_mean"),
        ("std_b1_loc", "b1_std"),
        ("mean_f2_loc", "f2_mean"),
        ("std_f2_loc", "f2_std"),
        ("mean_b2_loc", "b2_mean"),
        ("std_b2_loc", "b2_std"),
    )
    spectral_moment_keys = (
        ("spectral_gravity", "spectral_gravity"),
        ("spectral_std_dev", "spectral_std_dev"),
        ("spectral_skewness", "spectral_skewness"),
        ("spectral_kurtosis", "spectral_kurtosis"),
    )
    jitter_keys = (
        ("local_jitter", "local_jitter"),
        ("localabsolute_jitter", "localabsolute_jitter"),
        ("rap_jitter", "rap_jitter"),
        ("ppq5_jitter", "ppq5_jitter"),
        ("ddp_jitter", "ddp_jitter"),
    )
    shimmer_keys = (
        ("local_shimmer", "local_shimmer"),
        ("localDB_shimmer", "localDB_shimmer"),
        ("apq3_shimmer", "apq3_shimmer"),
        ("apq5_shimmer", "apq5_shimmer"),
        ("apq11_shimmer", "apq11_shimmer"),
        ("dda_shimmer", "dda_shimmer"),
    )

    def _merge(target: Dict[str, Any], block: Optional[Dict[str, Any]], key_pairs: Sequence[Any]) -> None:
        """Copy the listed values from `block` into `target`; a skipped block (None) is ignored."""
        if block is None:
            return
        for out_key, src_key in key_pairs:
            target[out_key] = block[src_key]

    results: List[Dict[str, Any]] = []

    for audio in audios:
        # The pitch range is shared by most extractors, so it is always computed.
        pitch_range = extract_pitch_values(snd=audio)
        f0_floor = pitch_range["pitch_floor"]
        f0_ceiling = pitch_range["pitch_ceiling"]

        # --- run the enabled extractors (a disabled group stays None) ---
        speech_rate_out = None
        if speech_rate:
            speech_rate_out = extract_speech_rate(snd=audio)

        pitch_out = None
        if pitch:
            pitch_out = extract_pitch_descriptors(
                snd=audio,
                floor=f0_floor,
                ceiling=f0_ceiling,
                frame_shift=time_step,
                unit=pitch_unit,
            )

        intensity_out = None
        if intensity_descriptors:
            intensity_out = extract_intensity_descriptors(snd=audio, floor=f0_floor, frame_shift=time_step)

        harmonicity_out = None
        if harmonicity_descriptors:
            harmonicity_out = extract_harmonicity_descriptors(snd=audio, floor=f0_floor, frame_shift=time_step)

        formants_out = None
        if formants:
            formants_out = measure_f1f2_formants_bandwidths(
                snd=audio,
                floor=f0_floor,
                ceiling=f0_ceiling,
                frame_shift=time_step,
            )

        spectral_moments_out = None
        if spectral_moments:
            spectral_moments_out = extract_spectral_moments(
                snd=audio,
                floor=f0_floor,
                ceiling=f0_ceiling,
                window_size=window_length,
                frame_shift=time_step,
            )

        slope_tilt_out = None
        if slope_tilt:
            slope_tilt_out = extract_slope_tilt(snd=audio, floor=f0_floor, ceiling=f0_ceiling)

        cpp_out = None
        if cpp_descriptors:
            cpp_out = extract_cpp_descriptors(
                snd=audio,
                floor=f0_floor,
                ceiling=f0_ceiling,
                frame_shift=time_step,
            )

        audio_duration_out = None
        if duration:
            audio_duration_out = extract_audio_duration(snd=audio)

        jitter_out = None
        if jitter:
            jitter_out = extract_jitter(snd=audio, floor=f0_floor, ceiling=f0_ceiling)

        shimmer_out = None
        if shimmer:
            shimmer_out = extract_shimmer(snd=audio, floor=f0_floor, ceiling=f0_ceiling)

        # --- assemble the flat output; the order below fixes the key order ---
        features: Dict[str, Any] = {}

        _merge(features, audio_duration_out, duration_keys)
        _merge(features, speech_rate_out, speech_rate_keys)

        if pitch_out is not None:
            # Pitch key names embed the lower-cased unit.
            unit_suffix = pitch_unit.lower()
            _merge(
                features,
                pitch_out,
                (
                    (f"mean_f0_{unit_suffix}", f"mean_f0_{unit_suffix}"),
                    (f"std_f0_{unit_suffix}", f"stdev_f0_{unit_suffix}"),
                ),
            )

        _merge(features, intensity_out, intensity_keys)
        _merge(features, harmonicity_out, harmonicity_keys)
        _merge(features, slope_tilt_out, slope_tilt_keys)
        _merge(features, cpp_out, cpp_keys)
        _merge(features, formants_out, formant_keys)
        _merge(features, spectral_moments_out, spectral_moment_keys)
        _merge(features, jitter_out, jitter_keys)
        _merge(features, shimmer_out, shimmer_keys)

        results.append(features)

    return results

Extract features from a list of Audio objects and return one JSON-like dictionary of features per audio.

Arguments:
  • audios (list): List of Audio objects to extract features from.
  • pitch_unit (str): Unit for pitch measurements. Defaults to "Hertz".
  • time_step (float): Time rate at which to extract features. Defaults to 0.005.
  • window_length (float): Window length in seconds for spectral features. Defaults to 0.025.
  • speech_rate (bool): Whether to extract speech rate. Defaults to True.
  • intensity_descriptors (bool): Whether to extract intensity descriptors. Defaults to True.
  • harmonicity_descriptors (bool): Whether to extract harmonic descriptors. Defaults to True.
  • formants (bool): Whether to extract formants. Defaults to True.
  • spectral_moments (bool): Whether to extract spectral moments. Defaults to True.
  • pitch (bool): Whether to extract pitch. Defaults to True.
  • slope_tilt (bool): Whether to extract slope and tilt. Defaults to True.
  • cpp_descriptors (bool): Whether to extract CPP descriptors. Defaults to True.
  • duration (bool): Whether to extract duration. Defaults to True.
  • jitter (bool): Whether to extract jitter. Defaults to True.
  • shimmer (bool): Whether to extract shimmer. Defaults to True.
Returns:

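List[Dict[str, Any]]: A list with one JSON-like dictionary of extracted features per input audio, in the same order as audios. Each dictionary is flat and contains only the keys of the enabled feature groups.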