# Source code for variationist.data.preprocess_utils

import itertools
import os
import pandas as pd
import re
import stopwordsiso as stopwords
import sys
from tqdm import tqdm

from variationist import utils
from variationist.data import tokenization_utils


def remove_elements(token_list, stopwords):
    """
    Filter stopwords out of a single tokenized text.

    Each token is lower-cased before the membership test, so the comparison is
    case-insensitive with respect to the tokens; the entries of `stopwords`
    themselves are compared as given. Surviving tokens keep their original
    casing and order.

    Parameters
    ----------
    token_list: `Iterable`
        The tokens (strings) of one text.
    stopwords: `Iterable`
        The stopwords to drop. Note that, inside this function, the parameter
        name hides the module-level `stopwords` import.

    Returns
    -------
    new_array: `List`
        A new list holding the tokens of `token_list` that are not stopwords.
    """

    return [token for token in token_list if token.lower() not in stopwords]


def remove_stopwords(text_column, language, custom_stopwords):
    """
    Remove stopwords from every text of an already tokenized column.

    The stopword collection is the union of the language stopwords (when
    `language` is given) and the custom stopwords (when `custom_stopwords` is
    given). Filtering of each text is delegated to `remove_elements`.

    Parameters
    ----------
    text_column: pandas.Series
        A series containing the already tokenized texts. A single-column
        DataFrame is accepted as well and is reduced to a Series.
    language: `str`, *optional*
        The language we should retrieve stopwords for, as an ISO 639-1
        two-letter code (e.g., en, it, fr, de). If None, no language
        stopwords are used.
    custom_stopwords: `str` or `List`, *optional*
        A list of stopwords (or a path to a file containing stopwords, one per
        line). If None, no custom stopwords are used.

    Returns
    -------
    text_column: pandas.Series
        The same tokenized series as input, with stopwords removed.
    """

    # Always accumulate into our own set: the previous list default had no
    # `.update`, so custom stopwords without a language raised AttributeError.
    # Copying also avoids mutating whatever object the stopword library returns.
    lang_stopwords = set()

    # Language needs to be ISO 639-1 (two-letter codes, e.g., en, it, fr, de, etc.)
    if language is not None:
        lang_stopwords.update(stopwords.stopwords(language))

    if custom_stopwords is not None:
        lang_stopwords.update(get_custom_stopword_list(custom_stopwords))

    column = text_column.squeeze()
    if not isinstance(column, (pd.Series, pd.DataFrame)):
        # A one-row input collapses to its single cell under a plain squeeze(),
        # which has no `.apply`. Keep it as a one-element Series instead.
        if isinstance(text_column, pd.DataFrame):
            column = text_column.squeeze(axis="columns")
        else:
            column = text_column

    return column.apply(lambda tokens: remove_elements(tokens, lang_stopwords))


def get_custom_stopword_list(custom_stopwords):
    """
    Return the custom stopwords as a list, reading them from a file if needed.

    Parameters
    ----------
    custom_stopwords: `str`, path-like, `List` or other iterable
        Either a path to a text file containing one stopword per line, or an
        in-memory collection of stopwords.

    Returns
    -------
    extra_stopwords: List
        A list including the custom stopwords. A list input is returned as-is
        (same object); any other iterable is copied into a new list; a path is
        read line by line with only the trailing newline removed.
    """

    if isinstance(custom_stopwords, list):
        return custom_stopwords

    if isinstance(custom_stopwords, (str, bytes, os.PathLike)):
        # Read as UTF-8 explicitly so that stopwords with non-ASCII characters
        # do not depend on the platform's default encoding.
        with open(custom_stopwords, "r", encoding="utf-8") as stopword_file:
            return [line.rstrip("\n") for line in stopword_file]

    # Tuples, sets and other iterables are collections of stopwords, not paths.
    return list(custom_stopwords)


def convert_to_ngrams(token_list, n_tokens):
    """
    Turn the tokens of one text into space-joined n-grams.

    A window of `n_tokens` consecutive tokens is slid over `token_list` one
    position at a time; the tokens in each window are joined with a single
    space. A text with fewer than `n_tokens` tokens yields an empty list.

    Parameters
    ----------
    token_list: Iterable
        The tokens of one text. It must support `len()` and slicing.
    n_tokens: int
        The n to use for n-grams. E.g., a value of 2 will result in bi-grams.

    Returns
    -------
    new_array: List
        The n-grams of the text, in order of appearance.
    """

    window_count = len(token_list) - n_tokens + 1
    return [
        " ".join(token_list[start: start + n_tokens])
        for start in range(window_count)
    ]


def create_tokenized_ngrams_column(tokenized_text_column, n_tokens):
    """
    Replace the tokens of every text in a tokenized column with n-grams.

    Each text is converted with `convert_to_ngrams`; progress is reported
    through tqdm's pandas integration.

    Parameters
    ----------
    tokenized_text_column: pandas.Series
        A series containing the already tokenized texts. A single-column
        DataFrame is accepted as well and is reduced to a Series.
    n_tokens: int
        The n to use for n-grams. E.g., a value of 2 will result in bi-grams.

    Returns
    -------
    tokenized_text_column: pandas.Series
        A series with the same number of texts as the input, where each text
        is a list of n-grams instead of single tokens.
    """

    # Registers `progress_apply` on pandas objects.
    tqdm.pandas()

    column = tokenized_text_column.squeeze()
    if not isinstance(column, (pd.Series, pd.DataFrame)):
        # A one-row input collapses to its single cell under a plain squeeze(),
        # which has no `.progress_apply`. Keep it as a one-element Series.
        if isinstance(tokenized_text_column, pd.DataFrame):
            column = tokenized_text_column.squeeze(axis="columns")
        else:
            column = tokenized_text_column

    return column.progress_apply(lambda tokens: convert_to_ngrams(tokens, n_tokens))


# @TODO this will be developed in a future release
# def discretize_granularity(dataframe, var_names, var_types, var_semantics, var_granularity):
#     for i in range(len(var_names)):
#         if var_granularity != None:
#             break
#     # then we will map (check pandas docs)
#     return dataframe


def discretize_bins_col(dataframe_var_col, curr_var_bins):
    """
    Discretize a variable column into bins with `pandas.cut`.

    The computed bin edges are printed so the user can see which cutoff values
    were used.

    Parameters
    ----------
    dataframe_var_col: pandas.Series
        The column holding the variable that should be divided into bins. Its
        `name` is used in the printed message.
    curr_var_bins: int
        The number of bins to divide the current variable into (as set by the
        user through var_bins). It is forwarded unchanged as the `bins`
        argument of `pandas.cut`.

    Returns
    -------
    discretized_var_col: pandas.Series
        The same Series as input, but with each value replaced by the bin
        (interval) it falls into.
    """

    cut_result = pd.cut(dataframe_var_col, bins=curr_var_bins, retbins=True)
    # With retbins=True, pandas.cut returns (binned values, bin edges).
    discretized_var_col = cut_result[0]
    bin_edges = cut_result[1]

    print(f"""INFO: The calculated cutoff values of bins for the {dataframe_var_col.name} variable are:\n{list(bin_edges)}\nThese will be reported as (value_x, value_x+1] in the results.""")

    return discretized_var_col


def extract_combinations(token_list, n_items, context_window, unique_cooc):
    """
    Extract the co-occurrences found in the tokens of one text.

    A window of `context_window` tokens is slid over the text one position at
    a time. Every combination of `n_items` tokens inside a window becomes one
    co-occurrence: its tokens are sorted and joined with a single space, so the
    order in which the tokens appeared does not matter. A co-occurrence found
    in several windows is reported only once.

    Parameters
    ----------
    token_list: Iterable
        The tokens of the text, out of which to extract co-occurrences. It
        must support `len()` and slicing.
    n_items: int
        The number of co-occurring tokens we should consider. Corresponds to
        `n_cooc` set by the user in InspectorArgs.
    context_window: int
        Size of the context window for co-occurrences, corresponding to
        `cooc_window_size` in InspectorArgs. A value of 0 means the whole text
        is a single window. A window larger than the text is reduced to the
        length of the text.
    unique_cooc: bool
        If True, combinations that contain the same token more than once are
        skipped.

    Returns
    -------
    new_array: List
        The distinct co-occurrences of the text, in order of first appearance.
    """

    n_tokens = len(token_list)
    # 0 means "whole text". A window wider than the text is clamped as well,
    # otherwise short texts would silently produce no co-occurrences at all.
    if context_window == 0 or context_window > n_tokens:
        context_window = n_tokens

    # A dict works as an insertion-ordered set: overlapping windows repeat most
    # combinations, and the output order stays reproducible across runs.
    found = {}
    for start in range(n_tokens - context_window + 1):
        window = token_list[start: start + context_window]
        for cooc in itertools.combinations(window, n_items):
            if unique_cooc and len(set(cooc)) != len(cooc):
                continue
            found[" ".join(sorted(cooc))] = None

    return list(found)
    

def create_tokenized_cooccurrences_column(tokenized_text_column, n_items, context_window, unique_cooc):
    """
    Replace the tokens of every text in a tokenized column with co-occurrences.

    Each text is converted with `extract_combinations`; progress is reported
    through tqdm's pandas integration.

    Parameters
    ----------
    tokenized_text_column: pandas.Series
        A series containing the already tokenized texts. A single-column
        DataFrame is accepted as well and is reduced to a Series.
    n_items: int
        The number of co-occurring tokens we should consider. Corresponds to
        `n_cooc` set by the user in InspectorArgs.
    context_window: int
        Size of the context window for co-occurrences, corresponding to
        `cooc_window_size` in InspectorArgs. 0 disables the size check below.
    unique_cooc: bool
        Forwarded to `extract_combinations`: whether combinations repeating
        the same token should be skipped.

    Returns
    -------
    text_column: pandas.Series
        A series with the same number of texts as the input, where each text
        is a list of co-occurrences in lieu of the original tokens.

    Raises
    ------
    SystemExit
        If `context_window` is non-zero and smaller than `n_items`.
    """

    if n_items > context_window and context_window != 0:
        sys.exit("ERROR: The size of the context windows cannot be lower than the number of words when extracting the cooccurrences!\nExit.")

    # Registers `progress_apply` on pandas objects.
    tqdm.pandas()

    column = tokenized_text_column.squeeze()
    if not isinstance(column, (pd.Series, pd.DataFrame)):
        # A one-row input collapses to its single cell under a plain squeeze(),
        # which has no `.progress_apply`. Keep it as a one-element Series.
        if isinstance(tokenized_text_column, pd.DataFrame):
            column = tokenized_text_column.squeeze(axis="columns")
        else:
            column = tokenized_text_column

    return column.progress_apply(
        lambda tokens: extract_combinations(tokens, n_items, context_window, unique_cooc)
    )
                                                            

def get_label_values(input_dataframe, col_names_dict):
    """
    Collect the unique values of every label column of the dataset.

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        The dataset to be analyzed.
    col_names_dict: Dict
        A dictionary of column names; the label columns are read from its
        `utils.LABEL_COLS_KEY` entry.

    Returns
    -------
    label_values_dict: Dict
        Maps each label column name to the list of unique values found in that
        column of `input_dataframe`.
    """

    label_columns = col_names_dict[utils.LABEL_COLS_KEY]

    return {
        column: pd.unique(input_dataframe[column]).tolist()
        for column in label_columns
    }


def update_label_values_dict_with_inters(label_values_dict, text_names):
    """
    Build the names of all intersections of variable values (and text columns).

    The result has a single key: the variable names joined with "::", prefixed
    with "text_name::" when there is more than one text column. Its value lists
    one name per element of the cartesian product of the variable values, with
    the components joined by "::".

    When there are several text columns, the text column name is the FIRST
    component of each intersection name. This matches both the "text_name::"
    prefix of the key and the "<text column>::<values>" names produced by
    `get_subset_intersections`.

    Parameters
    ----------
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can
        take in the input dataset.
    text_names: List
        The list of text column names.

    Returns
    -------
    inters_label_values_dict: Dict
        A dictionary with one entry, mapping the combination name to the list
        of all intersection names.
    """

    # Fresh list, so adding the text names never touches the caller's dict.
    value_axes = list(label_values_dict.values())
    var_combination_name = "::".join(label_values_dict.keys())

    if len(text_names) > 1:
        var_combination_name = f"text_name::{var_combination_name}"
        # Text names go first so that components follow the order of the key.
        value_axes.insert(0, text_names)

    intersection_names = [
        "::".join(map(str, intersection))
        for intersection in itertools.product(*value_axes)
    ]

    return {var_combination_name: intersection_names}


def get_subset_dict(input_dataframe, tok_columns_dict, label_values_dict):
    """
    Split the tokenized texts into one subset per value of each variable.

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        The dataset to be analyzed.
    tok_columns_dict: Dict
        Maps each text column to the name of the dataframe column holding its
        tokenized version.
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can
        take in the input dataset.

    Returns
    -------
    subsets_of_interest: Dict
        Maps each variable name to a list with one entry per variable value:
        the tokenized texts of the rows having that value, renamed after the
        value. With several text columns, the entries computed for a later
        text column replace those of the earlier ones.
    """

    subsets_of_interest = {}

    # Walk over the tokenized column behind every text column
    for tokenized_text_column in tok_columns_dict.values():
        # ...and over every variable together with its possible values
        for label, label_values in label_values_dict.items():
            subsets_for_label = []
            for label_value in label_values:
                matching_rows = input_dataframe[input_dataframe[label] == label_value]
                texts_for_value = matching_rows[tokenized_text_column]
                # Squeeze only with 2+ rows: a single row would collapse to a scalar
                if len(texts_for_value) > 1:
                    texts_for_value = texts_for_value.squeeze()
                subsets_for_label.append(texts_for_value.rename(label_value))
            subsets_of_interest[label] = subsets_for_label

    return subsets_of_interest


def get_subset_intersections(input_dataframe, tok_columns_dict, label_values_dict):
    """
    Split the tokenized texts into one subset per intersection of variable values.

    For every text column and every element of the cartesian product of the
    variable values, the rows matching all values of the intersection are
    selected and their tokenized texts are stored under a name built by
    joining the values with "::" (prefixed with the text column name when
    there are several text columns).

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        The dataset to be analyzed.
    tok_columns_dict: Dict
        Maps each text column to the name of the dataframe column holding its
        tokenized version.
    label_values_dict: Dict
        A dictionary containing all of the possible values each variable can
        take in the input dataset.

    Returns
    -------
    subsets_of_interest: Dict
        Under the combination key (the variable names joined with "::",
        prefixed with "text_name::" for several text columns): a list with the
        tokenized texts of every intersection, each renamed after its
        intersection. In addition, every intersection name is itself a key
        holding the filtered rows of that intersection.
    """

    variable_names = list(label_values_dict)
    value_lists = list(label_values_dict.values())
    text_cols = list(tok_columns_dict)

    var_combination_name = "::".join(variable_names)
    if len(text_cols) > 1:
        combination_key = f"text_name::{var_combination_name}"
    else:
        combination_key = var_combination_name
    subsets_of_interest = {combination_key: []}

    # The value combinations do not depend on the text column
    value_combinations = list(itertools.product(*value_lists))

    for text_column in text_cols:
        print("INFO: Splitting intersections of variables into subsets.")
        print(f"Subsets for text column '{text_column}'...")
        tokenized_text_column = tok_columns_dict[text_column]

        for intersection in tqdm(value_combinations):
            intersection_name = "::".join(map(str, intersection))
            if len(text_cols) > 1:
                intersection_name = f"{text_column}::{intersection_name}"

            # Narrow the dataset down one variable at a time
            current_subset = input_dataframe
            for variable_name, variable_value in zip(variable_names, intersection):
                current_subset = current_subset[current_subset[variable_name] == variable_value]

            # Squeeze only when 2 or more rows are left
            if len(current_subset) > 1:
                current_subset = current_subset.squeeze()

            series_with_current_inters = current_subset[tokenized_text_column]
            # NOTE(review): this also stores the filtered rows under the
            # intersection name, next to the list below -- confirm whether
            # callers rely on these extra keys.
            subsets_of_interest[intersection_name] = current_subset
            series_with_current_inters = series_with_current_inters.rename(intersection_name)
            subsets_of_interest[combination_key].append(series_with_current_inters)

    return subsets_of_interest