Files
Made-With-ML/madewithml/data.py
2023-09-18 22:03:20 -07:00

152 lines
5.5 KiB
Python

import re
from functools import lru_cache
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import ray
from ray.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

from madewithml.config import STOPWORDS
def load_data(dataset_loc: str, num_samples: Optional[int] = None) -> Dataset:
    """Load data from source into a Ray Dataset.

    Args:
        dataset_loc (str): Location of the dataset.
        num_samples (Optional[int], optional): The number of samples to load. Defaults to None (load everything).

    Returns:
        Dataset: Our dataset represented by a Ray Dataset.
    """
    ds = ray.data.read_csv(dataset_loc)
    ds = ds.random_shuffle(seed=1234)  # fixed seed so sampling below is reproducible
    # Only materialize a subset when explicitly requested.
    ds = ray.data.from_items(ds.take(num_samples)) if num_samples else ds
    return ds
def stratify_split(
    ds: Dataset,
    stratify: str,
    test_size: float,
    shuffle: bool = True,
    seed: int = 1234,
) -> Tuple[Dataset, Dataset]:
    """Split a dataset into train and test splits with equal
    amounts of data points from each class in the column we
    want to stratify on.

    Args:
        ds (Dataset): Input dataset to split.
        stratify (str): Name of column to split on.
        test_size (float): Proportion of dataset to split for test set.
        shuffle (bool, optional): whether to shuffle the dataset. Defaults to True.
        seed (int, optional): seed for shuffling. Defaults to 1234.

    Returns:
        Tuple[Dataset, Dataset]: the stratified train and test datasets.
    """

    def _add_split(group_df: pd.DataFrame) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Split one stratum's rows into train/test and tag each row
        with a `_split` column recording which side it landed on."""
        train_rows, test_rows = train_test_split(group_df, test_size=test_size, shuffle=shuffle, random_state=seed)
        train_rows["_split"] = "train"
        test_rows["_split"] = "test"
        return pd.concat([train_rows, test_rows])

    def _filter_split(batch_df: pd.DataFrame, split: str) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Keep only the rows tagged with `split` and drop the helper column."""
        mask = batch_df["_split"] == split
        return batch_df.loc[mask].drop("_split", axis=1)

    # Tag rows per unique value of the stratify column, then carve out each split.
    tagged = ds.groupby(stratify).map_groups(_add_split, batch_format="pandas")
    train_ds = tagged.map_batches(_filter_split, fn_kwargs={"split": "train"}, batch_format="pandas")
    test_ds = tagged.map_batches(_filter_split, fn_kwargs={"split": "test"}, batch_format="pandas")

    # Shuffle each split (required)
    return train_ds.random_shuffle(seed=seed), test_ds.random_shuffle(seed=seed)
def clean_text(text: str, stopwords: List = STOPWORDS) -> str:
    """Clean raw text string.

    Order matters: links are stripped BEFORE punctuation is removed.
    Previously the `http\\S+` pass ran last, by which point the URL's
    `://`, `.` and `/` separators were already replaced with spaces, so
    only the bare "http(s)" token was removed and the rest of the URL
    survived as noise words.

    Args:
        text (str): Raw text to clean.
        stopwords (List, optional): list of words to filter out. Defaults to STOPWORDS.

    Returns:
        str: cleaned text.
    """
    # Lower
    text = text.lower()
    # Remove links (while the URL is still intact)
    text = re.sub(r"http\S+", "", text)
    # Remove stopwords
    pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub(" ", text)
    # Spacing and filters
    text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text)  # add spacing
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()  # strip white space at the ends
    return text
@lru_cache(maxsize=1)
def _load_tokenizer():
    """Load the SciBERT tokenizer once and reuse it across calls."""
    return BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)


def tokenize(batch: Dict) -> Dict:
    """Tokenize the text input in our batch using a tokenizer.

    Args:
        batch (Dict): batch of data with the text inputs to tokenize.

    Returns:
        Dict: batch of data with the results of tokenization (`input_ids` and `attention_mask`) on the text inputs.
    """
    # This function is invoked once per batch; loading the tokenizer inside
    # the call re-instantiated it every time, so it is cached in a helper.
    tokenizer = _load_tokenizer()
    encoded_inputs = tokenizer(batch["text"].tolist(), return_tensors="np", padding="longest")
    return dict(ids=encoded_inputs["input_ids"], masks=encoded_inputs["attention_mask"], targets=np.array(batch["tag"]))
def preprocess(df: pd.DataFrame, class_to_index: Dict) -> Dict:
    """Preprocess the data in our dataframe.

    Args:
        df (pd.DataFrame): Raw dataframe to preprocess.
        class_to_index (Dict): Mapping of class names to indices.

    Returns:
        Dict: preprocessed data (ids, masks, targets).
    """
    df = df.copy()  # work on a copy so the caller's dataframe isn't mutated
    df["text"] = df.title + " " + df.description  # feature engineering
    df["text"] = df.text.apply(clean_text)  # clean text
    df = df.drop(columns=["id", "created_on", "title", "description"], errors="ignore")  # clean dataframe
    df = df[["text", "tag"]]  # rearrange columns
    df["tag"] = df["tag"].map(class_to_index)  # label encoding
    outputs = tokenize(df)
    return outputs
class CustomPreprocessor:
    """Custom preprocessor: holds the tag→index vocabulary and applies
    `preprocess` over a Ray Dataset."""

    def __init__(self, class_to_index: Optional[Dict] = None):
        """
        Args:
            class_to_index (Optional[Dict], optional): existing tag→index mapping.
                Defaults to None, meaning the mapping is learned via `fit`.
        """
        # `None` sentinel instead of a mutable `{}` default argument,
        # which would be shared across all instances.
        self.class_to_index = class_to_index or {}
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}

    def fit(self, ds):
        """Learn the tag→index mapping from the dataset's unique tags.

        Returns:
            CustomPreprocessor: self, for chaining.
        """
        tags = ds.unique(column="tag")
        self.class_to_index = {tag: i for i, tag in enumerate(tags)}
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        return self

    def transform(self, ds):
        """Apply `preprocess` to every batch using the fitted mapping."""
        return ds.map_batches(preprocess, fn_kwargs={"class_to_index": self.class_to_index}, batch_format="pandas")