Utility for `Pandas`
from fastcore.all import *
from nbdev.export import Config
from PIL import Image

from gale.collections.download import download_and_extract_archive
from gale.utils.display import show_images

data_path = Path(Config().path("nbs_path")) / "data"

# download a toy dataset
download_and_extract_archive(
    url="https://download.pytorch.org/tutorial/hymenoptera_data.zip",
    download_root=data_path,
    extract_root=data_path,
)
path = data_path / "hymenoptera_data"
path.ls()
Using downloaded and verified file: /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data.zip
Extracting /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data.zip to /Users/ayushman/Desktop/gale/nbs/data
(#2) [Path('/Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/train'),Path('/Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/val')]

folder2df[source]

folder2df(directory:Union[str, Path], extensions:list=('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp'), shuffle:bool=False, seed:int=42)

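Parses all the images in directory and puts them in a DataFrame object with an image_id column (the path of each image file) and a target column (the class label of each image).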

Arguments:

  • directory: path to dirs, for example /home/user/data/**
  • extensions: file extensions of the images.
  • shuffle: shuffles the resulting DataFrame object.
  • seed: sets the seed for reproducibility.
df = folder2df(path / "train")
df.head()
Found 244 files belonging to 2 classes.
image_id target
0 /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/train/bees/2638074627_6b3ae746a0.jpg bees
1 /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/train/bees/507288830_f46e8d4cb2.jpg bees
2 /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/train/bees/2405441001_b06c36fa72.jpg bees
3 /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/train/bees/2962405283_22718d9617.jpg bees
4 /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/train/bees/446296270_d9e8b93ecf.jpg bees
ims = [
    Image.open(df["image_id"][0]),
    Image.open(df["image_id"][1]),
    Image.open(df["image_id"][2]),
    Image.open(df["image_id"][3]),
]
titles = [df["target"][0], df["target"][1], df["target"][2], df["target"][3]]

show_images(ims, titles=titles)

split_dataframe_into_stratified_folds[source]

split_dataframe_into_stratified_folds(dataframe:DataFrame, label_column:str, fold_column:str=None, n_splits=5, shuffle=False, random_state=None)

Makes n_splits stratified folds in dataframe. label_column is the column whose values are used to stratify the split. The fold id of each row is given in fold_column. Set random_state for reproducibility.

get_dataframe_fold[source]

get_dataframe_fold(dataframe:DataFrame, split_column:str, split_idx:int)

Grabs the train and validation splits from the dataframe. Splits are inferred from split_column: the rows whose split_column value equals split_idx are the validation rows, and the rest are the train rows.

stratified_df = split_dataframe_into_stratified_folds(
    dataframe=df, label_column="target", n_splits=3
)

# get the train/validation split for each of the 3 folds
trn_0, val_0 = get_dataframe_fold(stratified_df, split_column="kfold", split_idx=0)
trn_1, val_1 = get_dataframe_fold(stratified_df, split_column="kfold", split_idx=1)
trn_2, val_2 = get_dataframe_fold(stratified_df, split_column="kfold", split_idx=2)

get_dataset_labeling[source]

get_dataset_labeling(dataframe:DataFrame, label_column:str)

Prepares a mapping using the unique values from label_column. Returns: a dictionary mapping each label to an integer.

get_dataset_labeling(stratified_df, label_column="target")
{'ants': 0, 'bees': 1}

dataframe_labels_2_int[source]

dataframe_labels_2_int(dataframe:DataFrame, label_column:str, return_labelling:bool=False)

Converts the labels of the dataframe in label_column to integers. Set return_labelling to return the dictionary for labels.

df3 = dataframe_labels_2_int(df, label_column="target")
df3.head()
image_id target
0 /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/train/bees/2638074627_6b3ae746a0.jpg 1
1 /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/train/bees/507288830_f46e8d4cb2.jpg 1
2 /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/train/bees/2405441001_b06c36fa72.jpg 1
3 /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/train/bees/2962405283_22718d9617.jpg 1
4 /Users/ayushman/Desktop/gale/nbs/data/hymenoptera_data/train/bees/446296270_d9e8b93ecf.jpg 1

split_dataframe_train_test[source]

split_dataframe_train_test(dataframe:DataFrame, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)

Splits dataframe into train and test parts.

train_df, val_df = split_dataframe_train_test(
    dataframe=df,
    test_size=0.5,
    train_size=0.5,
    random_state=42,
    stratify=df["target"],
    shuffle=True,
)

test_eq(len(train_df), len(val_df))