"""Functions for calculating a series of statistics for a given corpus."""
import pandas as pd
from itertools import islice
from statistics import stdev, mean
from tqdm import tqdm
from variationist.metrics import shared_metrics
[docs]def take(n, iterable):
"""Return the first n items of the iterable as a list."""
return list(islice(iterable, n))
[docs]def number_of_texts(label_values_dict, subsets_of_interest):
"""Returns a dictionary with how many texts are in each subset of interest.
Parameters
----------
label_values_dict: Dict
A dictionary containing all of the possible values each variable can take in the input dataset.
subsets_of_interest: Dict
A dictionary containing a pandas series with tokenized texts for each variable/text column combination out of the variables and text columns specified by the user.
Returns
-------
values_dict: Dict
A dict containing the length of each subset.
"""
values_dict = dict()
for column in label_values_dict:
for l in range(len(label_values_dict[column])):
curr_label = subsets_of_interest[column][l].name
values_dict[curr_label] = len(subsets_of_interest[column][l])
return values_dict
[docs]def average_text_length(label_values_dict, subsets_of_interest):
"""Returns a dictionary with the average length of texts in each subset of interest.
Parameters
----------
label_values_dict: Dict
A dictionary containing all of the possible values each variable can take in the input dataset.
subsets_of_interest: Dict
A dictionary containing a pandas series with tokenized texts for each variable/text column combination out of the variables and text columns specified by the user.
Returns
-------
values_dict: Dict
A dict containing the average length (and its standard deviation) of texts in each subset.
"""
values_dict = dict()
for column in label_values_dict:
for l in range(len(label_values_dict[column])):
values_list = []
curr_label = subsets_of_interest[column][l].name
for text in subsets_of_interest[column][l]:
if len(text) == 0:
continue
values_list.append(len(text))
values_dict[curr_label] = dict()
if len(values_list) == 0:
values_dict[curr_label]["mean"] = 0
else:
values_dict[curr_label]["mean"] = mean(values_list)
if len(values_list) < 2:
values_dict[curr_label]["stdev"] = 0
else:
values_dict[curr_label]["stdev"] = stdev(values_list)
return values_dict
[docs]def num_tokens(label_values_dict, subsets_of_interest):
"""Returns a dictionary with the total number of tokens in each subset.
Parameters
----------
label_values_dict: Dict
A dictionary containing all of the possible values each variable can take in the input dataset.
subsets_of_interest: Dict
A dictionary containing a pandas series with tokenized texts for each variable/text column combination out of the variables and text columns specified by the user.
Returns
-------
n_word_dict: Dict
A dict containing the total number of tokens in each subset."""
n_word_dict = dict()
for column in label_values_dict:
for l in range(len(label_values_dict[column])):
curr_label = subsets_of_interest[column][l].name
n_word_dict[curr_label] = 0
for text in subsets_of_interest[column][l]:
if len(text) == 0:
continue
n_word_dict[curr_label] = n_word_dict[curr_label]+(len(text))
return n_word_dict
[docs]def vocab_size(label_values_dict, subsets_of_interest):
"""Returns a dictionary with the total number of unique tokens in each subset - i.e. the size of the vocabulary for each subset.
Parameters
----------
label_values_dict: Dict
A dictionary containing all of the possible values each variable can take in the input dataset.
subsets_of_interest: Dict
A dictionary containing a pandas series with tokenized texts for each variable/text column combination out of the variables and text columns specified by the user.
Returns
-------
vocab_dict: Dict
A dict containing the vocabulary size of each subset."""
vocab_dict = dict()
for column in label_values_dict:
for l in range(len(label_values_dict[column])):
curr_label = subsets_of_interest[column][l].name
vocab_dict[curr_label] = set()
for text in subsets_of_interest[column][l]:
if len(text) == 0:
continue
vocab_dict[curr_label].update(text)
vocab_dict[curr_label] = len(vocab_dict[curr_label])
return vocab_dict
[docs]def number_of_duplicates(label_values_dict, subsets_of_interest):
"""Returns a dictionary with the number of duplicate texts in each subset of interest.
Parameters
----------
label_values_dict: Dict
A dictionary containing all of the possible values each variable can take in the input dataset.
subsets_of_interest: Dict
A dictionary containing a pandas series with tokenized texts for each variable/text column combination out of the variables and text columns specified by the user.
Returns
-------
duplicates_dict: Dict
A dict containing the number of duplicate texts in each subset."""
duplicates_dict = dict()
for column in label_values_dict:
for l in range(len(label_values_dict[column])):
text_dic = dict()
duplicates = 0
curr_label = subsets_of_interest[column][l].name
for text in subsets_of_interest[column][l]:
if len(text) == 0:
continue
if " ".join(text) in text_dic:
duplicates += 1
text_dic[" ".join(text)] = ""
duplicates_dict[curr_label] = duplicates
return duplicates_dict
[docs]def create_frequency_dictionary(label_values_dict, subsets_of_interest, args):
"""Returns a dictionary with the frequency of tokens in each subset of interest.
Parameters
----------
label_values_dict: Dict
A dictionary containing all of the possible values each variable can take in the input dataset.
subsets_of_interest: Dict
A dictionary containing a pandas series with tokenized texts for each variable/text column combination out of the variables and text columns specified by the user.
args: InspectorArgs
The arguments selected by the user.
Returns
-------
output_freqs: Dict
A dict containing the frequency of each token for each subset of interest."""
output_freqs = dict()
for column in label_values_dict:
for l in tqdm(range(len(label_values_dict[column]))):
curr_label = subsets_of_interest[column][l].name
mydict = shared_metrics.get_all_frequencies(subsets_of_interest[column][l])
sorted_mydict = sorted(mydict.items(), key=lambda x:x[1], reverse=True)
converted_dict = dict(sorted_mydict)
output_freqs[curr_label] = converted_dict
# print("most frequent", curr_label, take(10, converted_dict.items())) #print for debug
return output_freqs
[docs]def compute_basic_stats(label_values_dict, subsets_of_interest, args):
"""A wrapper function for calling all of the basic statistics functions.
Parameters
----------
label_values_dict: Dict
A dictionary containing all of the possible values each variable can take in the input dataset.
subsets_of_interest: Dict
A dictionary containing a pandas series with tokenized texts for each variable/text column combination out of the variables and text columns specified by the user.
args: InspectorArgs
The arguments selected by the user.
Returns
-------
stats_dict: Dict
A dict containing the calculated statistics.
"""
stats_dict = dict()
for stat in ["num_texts",
"avg_text_len",
"num_tokens",
"vocab_size",
"num_duplicates"]:
stats_dict[stat] = {}
stats_dict["num_texts"][list(label_values_dict.keys())[0]] = number_of_texts(label_values_dict, subsets_of_interest)
stats_dict["avg_text_len"][list(label_values_dict.keys())[0]] = average_text_length(label_values_dict, subsets_of_interest)
stats_dict["num_tokens"][list(label_values_dict.keys())[0]] = num_tokens(label_values_dict, subsets_of_interest)
stats_dict["vocab_size"][list(label_values_dict.keys())[0]] = vocab_size(label_values_dict, subsets_of_interest)
stats_dict["num_duplicates"][list(label_values_dict.keys())[0]] = number_of_duplicates(label_values_dict, subsets_of_interest)
# print(stats_dict)
return stats_dict