# Source code for variationist.data.tokenization_utils

import pandas as pd
import re
from tqdm import tqdm
from transformers import AutoTokenizer

from variationist import utils
from variationist import inspector


def whitespace_tokenization(text_column: pd.Series,
                            args) -> pd.Series:
    """Tokenize a series of texts by splitting on whitespace.

    Each text is converted to ``str`` (and lowercased when ``args.lowercase``
    is truthy), passed through ``utils.replace_symbols``, has every run of
    whitespace collapsed into a single space, is stripped, and is finally
    split on single spaces.

    Parameters
    ----------
    text_column: pandas.Series
        A pandas Series of text that should be tokenized.
    args: InspectorArgs
        The InspectorArgs that were passed to Inspector. Only
        ``args.lowercase`` is read here.

    Returns
    -------
    tok_column: pandas.Series
        A pandas Series containing the initial texts but tokenized, i.e. one
        list of string tokens per input text. A text that is empty after
        normalization yields ``[""]``.
    """
    tqdm.pandas()

    texts = text_column.squeeze()
    if not isinstance(texts, (pd.Series, pd.DataFrame)):
        # ``squeeze()`` collapses a single-element input into a bare scalar,
        # which has no ``.apply``/``.astype``. Wrap it back into a one-row
        # Series so that single-row datasets are tokenized instead of crashing.
        texts = pd.Series(
            [texts],
            index=text_column.index,
            name=getattr(text_column, "name", None),
        )

    if args.lowercase:
        tok_column = texts.apply(lambda x: str(x).lower())
    else:
        tok_column = texts.astype(str)

    # Only the first pass shows a progress bar; the remaining ones are cheap.
    tok_column = tok_column.progress_apply(lambda x: utils.replace_symbols(x))
    tok_column = tok_column.apply(lambda x: re.sub(r'\s+', ' ', x))
    tok_column = tok_column.apply(lambda x: x.strip().split(" "))
    return tok_column


def huggingface_tokenization(text_column: pd.Series,
                             args) -> pd.Series:
    """Tokenize a series of texts with a Hugging Face tokenizer.

    The tokenizer is the one named in ``args.tokenizer`` (with its leading
    ``hf::`` marker removed) and is loaded through
    ``AutoTokenizer.from_pretrained``. Each text is encoded to ids without
    special tokens, and the ids are then converted back to token strings.

    Parameters
    ----------
    text_column: pandas.Series
        A pandas Series of text that should be tokenized.
    args: InspectorArgs
        The InspectorArgs that were passed to Inspector. Only
        ``args.tokenizer`` is read here.

    Returns
    -------
    tok_column: pandas.Series
        A pandas Series containing the initial texts but tokenized, i.e. one
        list of string tokens per input text.
    """
    # ``str.strip("hf::")`` would treat its argument as a *set of characters*
    # and eat any leading/trailing 'h', 'f' or ':' (e.g. turning
    # "hf::facebook/opt-125m" into "acebook/opt-125m"). Remove the literal
    # prefix only.
    hf_prefix = "hf::"
    tokenizer_name = args.tokenizer
    if tokenizer_name.startswith(hf_prefix):
        tokenizer_name = tokenizer_name[len(hf_prefix):]
    hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    tqdm.pandas()

    texts = text_column.squeeze()
    if not isinstance(texts, (pd.Series, pd.DataFrame)):
        # ``squeeze()`` collapses a single-element input into a bare scalar,
        # which has no ``.apply``. Wrap it back into a one-row Series so that
        # single-row datasets are tokenized instead of crashing.
        texts = pd.Series(
            [texts],
            index=text_column.index,
            name=getattr(text_column, "name", None),
        )

    nulls = texts.isnull()
    if nulls.values.any():
        print(f"INFO: we detected one or more null value in the provided text column (indices {list(nulls[nulls].index)}). We will substitute them with an empty string.")
        texts = texts.fillna("")

    # Encode to ids first (with a progress bar), then map ids back to tokens.
    tok_column = texts.progress_apply(hf_tokenizer.encode, add_special_tokens=False)
    tok_column = tok_column.apply(hf_tokenizer.convert_ids_to_tokens)
    return tok_column