Source code for variationist.metrics.corpus_statistics

"""Functions for calculating a series of statistics for a given corpus."""

import pandas as pd
from itertools import islice
from statistics import stdev, mean
from tqdm import tqdm

from variationist.metrics import shared_metrics


def take(n, iterable):
    """Collect at most ``n`` leading items of ``iterable`` into a list.

    Parameters
    ----------
    n: int
        Upper bound on how many items are consumed from ``iterable``.
    iterable: Iterable
        Any iterable; it is only advanced as far as needed.

    Returns
    -------
    list
        The first ``n`` items, or fewer if ``iterable`` runs out earlier.
    """
    head = islice(iterable, n)
    return [item for item in head]
def number_of_texts(label_values_dict, subsets_of_interest):
    """Count the texts contained in each subset of interest.

    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can
        take in the input dataset.
    subsets_of_interest: Dict
        A dictionary containing a pandas series with tokenized texts for each
        variable/text column combination out of the variables and text columns
        specified by the user.

    Returns
    -------
    values_dict: Dict
        Maps each subset's ``name`` to the number of texts it holds.
    """
    counts = {}
    for variable in label_values_dict:
        # One subset is expected per possible value of the variable, so the
        # number of values bounds the positional lookups below.
        n_values = len(label_values_dict[variable])
        for position in range(n_values):
            subset = subsets_of_interest[variable][position]
            counts[subset.name] = len(subset)
    return counts
def average_text_length(label_values_dict, subsets_of_interest):
    """Compute mean and standard deviation of text length per subset.

    Text length is the number of tokens in a text. Empty texts are left out
    of both statistics.

    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can
        take in the input dataset.
    subsets_of_interest: Dict
        A dictionary containing a pandas series with tokenized texts for each
        variable/text column combination out of the variables and text columns
        specified by the user.

    Returns
    -------
    values_dict: Dict
        Maps each subset's ``name`` to a dict with the keys ``"mean"`` and
        ``"stdev"``. ``"mean"`` is 0 when the subset has no non-empty text;
        ``"stdev"`` is 0 when it has fewer than two.
    """
    summary = {}
    for variable in label_values_dict:
        n_values = len(label_values_dict[variable])
        for position in range(n_values):
            subset = subsets_of_interest[variable][position]
            label = subset.name
            lengths = [len(text) for text in subset if len(text) != 0]
            summary[label] = {
                # mean() needs at least one data point.
                "mean": mean(lengths) if lengths else 0,
                # The sample standard deviation needs at least two.
                "stdev": stdev(lengths) if len(lengths) >= 2 else 0,
            }
    return summary
def num_tokens(label_values_dict, subsets_of_interest):
    """Count the total number of tokens in each subset of interest.

    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can
        take in the input dataset.
    subsets_of_interest: Dict
        A dictionary containing a pandas series with tokenized texts for each
        variable/text column combination out of the variables and text columns
        specified by the user.

    Returns
    -------
    n_word_dict: Dict
        Maps each subset's ``name`` to the summed length of all its texts.
    """
    totals = {}
    for variable in label_values_dict:
        n_values = len(label_values_dict[variable])
        for position in range(n_values):
            subset = subsets_of_interest[variable][position]
            label = subset.name
            # Empty texts contribute zero, so they need no special handling.
            totals[label] = sum(len(text) for text in subset)
    return totals
def vocab_size(label_values_dict, subsets_of_interest):
    """Count the distinct tokens (vocabulary size) of each subset of interest.

    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can
        take in the input dataset.
    subsets_of_interest: Dict
        A dictionary containing a pandas series with tokenized texts for each
        variable/text column combination out of the variables and text columns
        specified by the user.

    Returns
    -------
    vocab_dict: Dict
        Maps each subset's ``name`` to the number of unique tokens found
        across all of its texts.
    """
    sizes = {}
    for variable in label_values_dict:
        n_values = len(label_values_dict[variable])
        for position in range(n_values):
            subset = subsets_of_interest[variable][position]
            label = subset.name
            # The set keeps one copy of every token seen in the subset.
            vocabulary = {token for text in subset for token in text}
            sizes[label] = len(vocabulary)
    return sizes
def number_of_duplicates(label_values_dict, subsets_of_interest):
    """Returns a dictionary with the number of duplicate texts in each subset
    of interest.

    A text counts as a duplicate when an identical token sequence has already
    been seen earlier in the same subset. Empty texts are ignored entirely.

    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can
        take in the input dataset.
    subsets_of_interest: Dict
        A dictionary containing a pandas series with tokenized texts for each
        variable/text column combination out of the variables and text columns
        specified by the user.

    Returns
    -------
    duplicates_dict: Dict
        A dict containing the number of duplicate texts in each subset.
    """
    duplicates_dict = {}
    for column in label_values_dict:
        for index in range(len(label_values_dict[column])):
            subset = subsets_of_interest[column][index]
            curr_label = subset.name
            seen_texts = set()
            duplicates = 0
            for text in subset:
                if len(text) == 0:
                    continue
                # Key on the token sequence itself. Joining tokens with a
                # space would make ["a b"] and ["a", "b"] indistinguishable
                # and count them as duplicates of each other.
                key = tuple(text)
                if key in seen_texts:
                    duplicates += 1
                else:
                    seen_texts.add(key)
            duplicates_dict[curr_label] = duplicates
    return duplicates_dict
def create_frequency_dictionary(label_values_dict, subsets_of_interest, args):
    """Build a token-frequency table for each subset of interest.

    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can
        take in the input dataset.
    subsets_of_interest: Dict
        A dictionary containing a pandas series with tokenized texts for each
        variable/text column combination out of the variables and text columns
        specified by the user.
    args: InspectorArgs
        The arguments selected by the user. Not read by this function.

    Returns
    -------
    output_freqs: Dict
        Maps each subset's ``name`` to a dict of the items returned by
        ``shared_metrics.get_all_frequencies``, ordered from the highest
        value to the lowest.
    """
    frequencies_by_label = {}
    for variable in label_values_dict:
        n_values = len(label_values_dict[variable])
        # tqdm shows progress over the subsets of the current variable.
        for position in tqdm(range(n_values)):
            subset = subsets_of_interest[variable][position]
            label = subset.name
            # Presumably a token -> count mapping; confirm in shared_metrics.
            raw_counts = shared_metrics.get_all_frequencies(subset)
            ranked = sorted(
                raw_counts.items(),
                key=lambda pair: pair[1],
                reverse=True,
            )
            # Dicts keep insertion order, so the ranking is preserved.
            frequencies_by_label[label] = dict(ranked)
    return frequencies_by_label
def compute_basic_stats(label_values_dict, subsets_of_interest, args):
    """Run every basic corpus statistic and gather the results in one dict.

    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can
        take in the input dataset.
    subsets_of_interest: Dict
        A dictionary containing a pandas series with tokenized texts for each
        variable/text column combination out of the variables and text columns
        specified by the user.
    args: InspectorArgs
        The arguments selected by the user. Not read by this function.

    Returns
    -------
    stats_dict: Dict
        Has the keys ``"num_texts"``, ``"avg_text_len"``, ``"num_tokens"``,
        ``"vocab_size"`` and ``"num_duplicates"``. Each maps to a dict with a
        single entry, keyed by the first key of ``label_values_dict``, that
        holds the output of the matching statistic function.

    Raises
    ------
    IndexError
        If ``label_values_dict`` is empty.
    """
    # Statistic name -> function computing it, in output order.
    calculators = (
        ("num_texts", number_of_texts),
        ("avg_text_len", average_text_length),
        ("num_tokens", num_tokens),
        ("vocab_size", vocab_size),
        ("num_duplicates", number_of_duplicates),
    )
    stats_dict = {stat_name: {} for stat_name, _ in calculators}

    # All results are filed under the first variable only, even though each
    # statistic function walks every variable in label_values_dict.
    first_variable = list(label_values_dict.keys())[0]
    for stat_name, calculator in calculators:
        stats_dict[stat_name][first_variable] = calculator(
            label_values_dict, subsets_of_interest
        )
    return stats_dict