# Source code for variationist.metrics.lexical_artifacts

import math
import numpy as np
import pandas as pd
import sys
from collections import Counter
from transformers import AutoTokenizer
from typing import List

from variationist.metrics import utils


# From: https://github.com/dhfbk/hate-speech-artifacts/blob/main/lexartifacts-package/src/lexartifacts/lexical_artifacts.py


def compute_pmi(
    w_count: Counter,
    l_count: Counter,
    w_l_count: Counter,
    num_texts: int
) -> pd.core.frame.DataFrame:
    """
    Score every (token, label) pair that co-occurs in the data with positive
    reweighted pointwise mutual information, following the implementation by [1].

    [1] Alan Ramponi and Sara Tonelli. 2022. Features or Spurious Artifacts? Data-centric
        Baselines for Fair and Robust Hate Speech Detection. In Proceedings of the 2022
        Conference of the North American Chapter of the Association for Computational
        Linguistics: Human Language Technologies.

    Parameters
    ----------
    w_count: Counter[str:int]
        Token counts over the whole dataset (i.e., {"token1": 123, "token2": 20, ...})
    l_count: Counter[str:int]
        Label counts over the whole dataset (i.e., {"label1": 42, "label2": 21, ...})
    w_l_count: Counter[(str,str):int]
        Token and label counts over the whole dataset (i.e., {("token1", "label1"): 12, ...})
    num_texts: int
        Total number of texts in the dataset

    Returns
    -------
    pd.core.frame.DataFrame
        Pandas dataframe with tokens as rows and the labels of "l_count" as columns.
        Values are reweighted PMI scores; pairs that never co-occur are left missing (NaN).
    """
    scores_by_label = {}

    for label, label_total in l_count.items():
        label_scores = scores_by_label.setdefault(label, {})

        for token, token_total in w_count.items():
            # P(w): share of texts (across all labels) that contain the token
            token_prob = token_total / float(num_texts)

            pair = (token, label)
            if pair not in w_l_count:
                # No co-occurrence: the cell stays empty in the resulting dataframe
                continue

            joint_count = w_l_count[pair]

            # P(w|l): share of the texts carrying this label that contain the token
            cond_prob = joint_count / float(label_total)

            # PMI(w,l) = log2(P(w|l) / P(w))
            pmi = math.log2(cond_prob / token_prob)

            # Reweight by the raw co-occurrence count to account for low-frequency tokens
            weighted_pmi = pmi * joint_count

            # Keep the score positive: anything at or below zero becomes EPSILON
            label_scores[token] = utils.EPSILON if weighted_pmi <= 0.0 else weighted_pmi

    return pd.DataFrame(scores_by_label)
def get_counts(
    texts: List[str],
    curr_label: str,
    label_of_interest: str,
    tokenizer: AutoTokenizer,
    tokenizer_type: str,
    stopwords: str = "en"
) -> (Counter, Counter, Counter):
    """
    A function that calculates relevant counts about a specific label after tokenizing
    the text according to a given pretrained tokenizer.

    Each token is counted at most once per text (tokens are deduplicated per text), and
    the label is normalized to either "label_of_interest" or the literal "other".

    Parameters
    ----------
    texts: List[str]
        Input texts belonging to a specific label "curr_label"
    curr_label: str
        Label whose examples will be counted and to which "texts" belong to
    label_of_interest: str
        Label that is the focus of the artifacts calculation
    tokenizer: AutoTokenizer
        HuggingFace's pretrained tokenizer to use (only its "tokenize" method is called)
    tokenizer_type: str
        Name of the pretrained tokenizer according to HuggingFace (e.g., "bert-base-uncased").
        Not read by this function; kept so existing callers keep working
    stopwords: str
        Language for the stopwords to be removed from lexical artifacts. Default: en (English)
        Only "en" triggers the removal (with a default stopword list); with any other
        value (e.g., None) all stopwords are retained in the list of lexical artifacts

    Returns
    -------
    token_counter: Counter
        Token counts for the given label "curr_label"
    label_counter: Counter
        Label counts for the given label "curr_label"
    token_label_counter: Counter
        Token and label counts for the given label "curr_label"
    """
    token_counter, label_counter, token_label_counter = Counter(), Counter(), Counter()

    # Both values are the same for every text, so they are computed once up front
    label = label_of_interest if curr_label == label_of_interest else "other"
    remove_stopwords = stopwords == "en"

    for text in texts:
        label_counter[label] += 1

        # set(): a token contributes at most once per text
        for token in set(tokenizer.tokenize(text)):
            if token == "":
                continue
            # Strip the BPE word-boundary marker "Ġ" before the stopword lookup
            if remove_stopwords and token.lstrip("Ġ") in utils.EN_STOP_WORDS:
                continue
            token_counter[token] += 1
            token_label_counter[(token, label)] += 1

    return token_counter, label_counter, token_label_counter
def normalize_pmi(pmi_scores: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """
    A function that normalizes a dataframe of PMI scores in [0,1], following the
    implementation by [1].

    [1] Alan Ramponi and Sara Tonelli. 2022. Features or Spurious Artifacts? Data-centric
        Baselines for Fair and Robust Hate Speech Detection. In Proceedings of the 2022
        Conference of the North American Chapter of the Association for Computational
        Linguistics: Human Language Technologies.

    Parameters
    ----------
    pmi_scores: pd.core.frame.DataFrame
        Pandas dataframe with tokens as rows and classes as columns (namely,
        label_of_interest and "other"). Values in this matrix are PMI scores.

    Returns
    -------
    pmi_normalized: pd.core.frame.DataFrame
        Normalized pandas dataframe with tokens as rows and classes as columns (namely,
        label_of_interest and "other"). Values in this matrix are normalized PMI scores.
        A column whose values are all identical is returned as all zeros.
    """

    def min_max_normalization(dataframe: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
        """
        An auxiliary function that performs the min-max normalization over log2 PMI scores.

        Parameters
        ----------
        dataframe: pd.core.frame.DataFrame
            Pandas dataframe with tokens as rows and classes as columns (namely,
            label_of_interest and "other"). Values in this matrix are log2 PMI scores.

        Returns
        -------
        df_normalized: pd.core.frame.DataFrame
            Normalized pandas dataframe with tokens as rows and classes as columns
            (namely, label_of_interest and "other"). Values in this matrix are min-max
            normalized log2 PMI scores.
        """
        df_normalized = dataframe.copy()

        for column in df_normalized.columns:
            curr_column = df_normalized[column]
            column_min = curr_column.min()
            column_range = curr_column.max() - column_min

            if column_range > 0:
                df_normalized[column] = (curr_column - column_min) / column_range
            else:
                # Constant column (e.g., every score was flattened to zero): dividing by
                # a zero range would turn the whole column into NaN, so map it to 0.0
                df_normalized[column] = 0.0

        return df_normalized

    # Fill missing values with epsilon for calculating the log2
    pmi_scores = pmi_scores.fillna(utils.EPSILON)

    # Normalize log2 PMI values in [0,1] (flattening negative values to zero)
    pmi_scores = np.log2(pmi_scores)
    pmi_scores[pmi_scores < 0.0] = 0.0
    pmi_normalized = min_max_normalization(pmi_scores)

    return pmi_normalized
def compute(
    texts: List[str],
    labels: List[str],
    label_of_interest: str,
    method: str = "pmi",
    special_tokens: List[str] = [],
    add_emojis: bool = True,
    stopwords: str = "",
    pretrained_tokenizer: str = "bert-base-uncased",
) -> pd.core.frame.DataFrame:
    """
    A function that computes lexical artifacts given an input dataset (texts and labels)
    and a label of interest. Additional parameters can be specified to e.g., exclude
    emojis from the computation of lexical artifacts, add special tokens to the
    tokenizer's vocabulary, and in the near future changing the method and the
    pretrained tokenizer.

    [1] Alan Ramponi and Sara Tonelli. 2022. Features or Spurious Artifacts? Data-centric
        Baselines for Fair and Robust Hate Speech Detection. In Proceedings of the 2022
        Conference of the North American Chapter of the Association for Computational
        Linguistics: Human Language Technologies.

    Parameters
    ----------
    texts: List[str]
        Input texts (note: the ith text of "texts" must match the ith label of "labels")
    labels: List[str]
        Input labels (note: the ith label of "labels" must match the ith text of "texts").
        Labels are converted to strings before any comparison
    label_of_interest: str
        Label that is the focus of the artifacts calculation (note: its string form
        must be in "labels")
    method: str
        Algorithm to compute the contribution strength of each token to each label.
        Default: "pmi"
        For now, we support "pmi" as implemented in [1], more on next releases
    special_tokens: List[str]
        List of special tokens to add to the tokenizer's vocabulary. Default: []
        The list is never modified by this function
    add_emojis: bool
        Whether or not adding emojis to the tokenizer's vocabulary. Default: True
        If this is set to False, a special token "[EMOJI]" will be used for all emojis
    stopwords: str
        The language for the stopwords to be removed from lexical artifacts.
        Default: "" (no stopword removal)
        For now, only "en" is supported (with a default stopword list); with any other
        value all stopwords are retained in the list of lexical artifacts
    pretrained_tokenizer: str
        Name of the HuggingFace's pretrained tokenizer to use (e.g., "bert-base-uncased")
        For now, BPE-based tokenizers (e.g., RoBERTa-base, GPT2) would not filter
        stopword correctly, if requested, due to the "Ġ" special character.
        Thorough support on next releases

    Returns
    -------
    sorted_pmi_scores: pd.core.frame.DataFrame
        Pandas dataframe with tokens as rows and label_of_interest as column, sorted in
        descending order. Values in this matrix are PMI scores following the
        implementation by [1].

    Raises
    ------
    SystemExit
        If "texts" and "labels" differ in length, if "label_of_interest" is not among
        the labels, or if "method" is not supported.
    """
    # Ensure texts and labels are of the same size
    if len(texts) != len(labels):
        sys.exit("ERROR: The number of texts and labels do not match! Exit.")

    # Convert labels to string BEFORE the membership check, so that e.g. integer labels
    # and a string label of interest (or vice versa) are compared consistently
    labels = [str(label) for label in labels]
    label_of_interest = str(label_of_interest)

    # Ensure the label of interest is actually in the label set
    if label_of_interest not in labels:
        sys.exit(f"ERROR: {label_of_interest} is not present in \"labels\"! Exit.")

    # Reject unsupported methods before loading the tokenizer and tokenizing everything
    if method != "pmi":
        sys.exit(f"The method {method} is not supported. Exit.")

    # Print a warning in case of very few examples
    if len(texts) <= 100:
        print(
            f"WARNING. It seems the dataset is so small ({len(texts)} examples). "
            "Note that this may affect the reliability of artifacts computation."
        )

    # Create a mapping dictionary label -> [text1, ..., textN]; keys keep the order in
    # which each label first appears
    label_to_texts = {}
    for text, label in zip(texts, labels):
        label_to_texts.setdefault(label, []).append(text)

    # Initialize the pretrained tokenizer with special tokens
    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer, use_fast=True)
    if add_emojis:
        tok_special_tokens = special_tokens + utils.EMOJIS_TOKENS
    else:
        tok_special_tokens = special_tokens + [utils.EMOJI_TOKEN]
    tokenizer.add_special_tokens({'additional_special_tokens': tok_special_tokens})

    # Tokenize text, normalize labels, and count token/label/token-label occurrences
    token_counter, label_counter, token_label_counter = Counter(), Counter(), Counter()
    for curr_label, curr_texts in label_to_texts.items():
        curr_token_counters, curr_label_counters, curr_token_label_counters = get_counts(
            curr_texts, curr_label, label_of_interest,
            tokenizer, pretrained_tokenizer, stopwords)
        token_counter += curr_token_counters
        label_counter += curr_label_counters
        token_label_counter += curr_token_label_counters

    # Get the total count of texts according to the labels that are taken into consideration
    texts_count = sum(label_counter.values())

    # Calculate pointwise mutual information and normalize scores in [0,1]
    pmi_scores = compute_pmi(token_counter, label_counter, token_label_counter, texts_count)
    pmi_scores_norm = normalize_pmi(pmi_scores)

    # Sort results by label (in descending order)
    sorted_pmi_scores = pmi_scores_norm[label_of_interest].to_frame().sort_values(
        by=[label_of_interest], ascending=False)

    return sorted_pmi_scores