Utilities for working with `pandas` DataFrames
from fastcore.all import *
from nbdev.export import Config
from PIL import Image
from gale.collections.download import download_and_extract_archive
from gale.utils.display import show_images
# Sample data lives in a "data" folder under the location that the nbdev
# config reports for "nbs_path".
data_path = Path(Config().path("nbs_path")).joinpath("data")

# Fetch the toy dataset archive; the same folder is passed both as the
# download destination and as the extraction destination.
download_and_extract_archive(
    url="https://download.pytorch.org/tutorial/hymenoptera_data.zip",
    extract_root=data_path,
    download_root=data_path,
)

# Folder the archive is expected to unpack into; list its entries to
# inspect the layout.
path = data_path.joinpath("hymenoptera_data")
path.ls()
# Build a dataframe from the "train" folder and preview its first rows.
df = folder2df(path / "train")
df.head()

# Open the images stored under index labels 0-3 of the "image_id" column and
# use the matching "target" values as their titles.
ims = [Image.open(df["image_id"][idx]) for idx in range(4)]
titles = [df["target"][idx] for idx in range(4)]
show_images(ims, titles=titles)
# Split the dataframe into three folds, stratifying on the "target" column.
stratified_df = split_dataframe_into_stratified_folds(
    n_splits=3, label_column="target", dataframe=df
)

# Retrieve the (train, validation) pair for each of the three folds; the fold
# assignment is looked up in the "kfold" column.
(trn_0, val_0), (trn_1, val_1), (trn_2, val_2) = [
    get_dataframe_fold(stratified_df, split_column="kfold", split_idx=fold_idx)
    for fold_idx in range(3)
]

# Inspect the labeling derived from the "target" column.
get_dataset_labeling(stratified_df, label_column="target")

# Convert the "target" labels to integers (per the helper's name) and preview
# the result.
df3 = dataframe_labels_2_int(df, label_column="target")
df3.head()
# Shuffle and split the dataframe 50/50 into train and validation parts,
# stratified on "target", with a fixed seed so the split is repeatable.
train_df, val_df = split_dataframe_train_test(
    dataframe=df,
    train_size=0.5,
    test_size=0.5,
    stratify=df["target"],
    shuffle=True,
    random_state=42,
)

# Both halves should contain the same number of rows.
# NOTE(review): this presumably only holds when `df` has an even row count -
# confirm against the splitter's rounding behaviour.
test_eq(len(train_df), len(val_df))