Source code for variationist.metrics.lexical_variation

import math
from statistics import stdev, mean
from tqdm import tqdm


[docs]def safe_divide(numerator, denominator):
    """Utility function to avoid zero division errors."""
    if denominator == 0 or denominator == 0.0:
        result = 0
    else:
        result = numerator / denominator

    return result


[docs]def ttr(label_values_dict, subsets_of_interest, args):
    """Calculates Type Token Ratio.
    
    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can take in the input dataset.
    subsets_of_interest: Dict
        A dictionary containing a pandas series with tokenized texts for each variable/text column combination out of the variables and text columns specified by the user.
    args: InspectorArgs
        The arguments selected by the user.
    
    Returns
    -------
    values_dict: Dict
        A dictionary with the mean TTR score for each subset and its standard deviation.
    """
    values_dict = dict()
    for column in label_values_dict:
        for l in tqdm(range(len(label_values_dict[column]))):
            values_list = []
            curr_label = subsets_of_interest[column][l].name
            for sentence in subsets_of_interest[column][l]:
                if len(sentence) == 0:
                    continue
                tok = len(sentence)
                typ = len(list(dict.fromkeys(sentence)))
                values_list.append(safe_divide(typ,tok))
            values_dict[curr_label] = dict()
            if len(values_list) == 0:
                values_dict[curr_label]["mean"] = 0
            else:
                values_dict[curr_label]["mean"] = mean(values_list)
            if len(values_list) < 2:
                values_dict[curr_label]["stdev"] = 0
            else:
                values_dict[curr_label]["stdev"] = stdev(values_list)

    # print("TTR: ",values_dict)

    return values_dict


[docs]def rttr(label_values_dict, subsets_of_interest, args):
    """Calculates Root Type Token Ratio.
    
    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can take in the input dataset.
    subsets_of_interest: Dict
        A dictionary containing a pandas series with tokenized texts for each variable/text column combination out of the variables and text columns specified by the user.
    args: InspectorArgs
        The arguments selected by the user.
    
    Returns
    -------
    values_dict: Dict
        A dictionary with the mean RTTR score for each subset and its standard deviation.
    """
    values_dict = dict()
    for column in label_values_dict:
        for l in tqdm(range(len(label_values_dict[column]))):
            values_list = []
            curr_label = subsets_of_interest[column][l].name
            for sentence in subsets_of_interest[column][l]:
                if len(sentence) == 0:
                    continue
                tok = len(sentence)
                typ = len(list(dict.fromkeys(sentence)))
                values_list.append(safe_divide(typ,math.sqrt(tok)))
            values_dict[curr_label] = dict()
            if len(values_list) == 0:
                values_dict[curr_label]["mean"] = 0
            else:
                values_dict[curr_label]["mean"] = mean(values_list)
            if len(values_list) < 2:
                values_dict[curr_label]["stdev"] = 0
            else:
                values_dict[curr_label]["stdev"] = stdev(values_list)

    # print("RTTR: ",values_dict)

    return values_dict


[docs]def maas(label_values_dict, subsets_of_interest, args):
    """Calculates Maas's index (Maas, 1972).
    
    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can take in the input dataset.
    subsets_of_interest: Dict
        A dictionary containing a pandas series with tokenized texts for each variable/text column combination out of the variables and text columns specified by the user.
    args: InspectorArgs
        The arguments selected by the user.
    
    Returns
    -------
    values_dict: Dict
        A dictionary with the mean Maas index score for each subset and its standard deviation.
    """
    values_dict = dict()
    for column in label_values_dict:
        for l in tqdm(range(len(label_values_dict[column]))):
            values_list = []
            curr_label = subsets_of_interest[column][l].name
            for sentence in subsets_of_interest[column][l]:
                if len(sentence) == 0:
                    continue
                tok = len(sentence)
                typ = len(list(dict.fromkeys(sentence)))
                values_list.append(safe_divide((math.log10(tok)-math.log10(typ)), math.pow(math.log10(tok),2)))
            values_dict[curr_label] = dict()
            if len(values_list) == 0:
                values_dict[curr_label]["mean"] = 0
            else:
                values_dict[curr_label]["mean"] = mean(values_list)
            if len(values_list) < 2:
                values_dict[curr_label]["stdev"] = 0
            else:
                values_dict[curr_label]["stdev"] = stdev(values_list)

    # print("MAAS: ",values_dict)

    return values_dict


[docs]def lttr(label_values_dict, subsets_of_interest, args):
    """Calculates Log Type Token Ratio.
    
    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can take in the input dataset.
    subsets_of_interest: Dict
        A dictionary containing a pandas series with tokenized texts for each variable/text column combination out of the variables and text columns specified by the user.
    args: InspectorArgs
        The arguments selected by the user.
    
    Returns
    -------
    values_dict: Dict
        A dictionary with the mean LTTR score for each subset and its standard deviation.
    """
    values_dict = dict()
    for column in label_values_dict:
        for l in tqdm(range(len(label_values_dict[column]))):
            values_list = []
            curr_label = subsets_of_interest[column][l].name
            for sentence in subsets_of_interest[column][l]:
                if len(sentence) == 0:
                    continue
                tok = len(sentence)
                typ = len(list(dict.fromkeys(sentence)))
                values_list.append(safe_divide(math.log10(typ), math.log10(tok)))
            values_dict[curr_label] = dict()
            if len(values_list) == 0:
                values_dict[curr_label]["mean"] = 0
            else:
                values_dict[curr_label]["mean"] = mean(values_list)
            if len(values_list) < 2:
                values_dict[curr_label]["stdev"] = 0
            else:
                values_dict[curr_label]["stdev"] = stdev(values_list)

    # print("LTTR: ",values_dict)
    
    return values_dict