How to Prepare Tabular Datasets and Models¶

This guide will bring you through examples on how to prepare datasets and models for AI Verify, for the following model types:

Binary Classification
Multiclass Classification
Regression
Pipeline

This tutorial uses the scikit-learn framework as an example.

If you would like to download and follow through this guide, you may download the relevant files via this link.

In [1]:

Copied!





import pandas as pd
import pickle
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import pickle
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

1. Sample Binary Classification Data and Model Preparation¶

Firstly, load the training and testing dataset

In [2]:

Copied!





train = pickle.load(open('data/pickle_pandas_mock_binary_classification_credit_risk_training.sav','rb'))
test = pickle.load(open('data/pickle_pandas_mock_binary_classification_credit_risk_testing.sav','rb'))
print("Training dataset:")
display(train)
print("Testing dataset")
display(test)
train = pickle.load(open('data/pickle_pandas_mock_binary_classification_credit_risk_training.sav','rb'))
test = pickle.load(open('data/pickle_pandas_mock_binary_classification_credit_risk_testing.sav','rb'))
print("Training dataset:")
display(train)
print("Testing dataset")
display(test)

Training dataset:

	age	gender	income	race	home_ownership	prior_count	loan_amount	loan_interests	default
0	48	1	67765	4	1	50	848525	0.471460	0
1	37	0	8394	1	0	55	70622	0.332984	0
2	55	0	98258	0	0	82	718975	0.794532	0
3	65	1	39496	1	0	8	428136	0.810374	1
4	79	1	57483	4	1	14	60585	0.878831	0
...	...	...	...	...	...	...	...	...	...
4995	46	0	97108	4	1	67	544253	0.206295	1
4996	39	0	94219	1	1	79	148536	0.064734	0
4997	57	1	17000	0	1	48	96803	0.399618	1
4998	56	0	43967	5	1	11	183472	0.369382	1
4999	57	0	36112	0	1	15	949435	0.584881	1

5000 rows × 9 columns

Testing dataset

	age	gender	income	race	home_ownership	prior_count	loan_amount	loan_interests	default
0	86	1	64570	2	1	41	872014	0.628734	1
1	32	1	46194	4	1	46	39121	0.518054	1
2	30	1	89651	5	0	48	587940	0.414793	0
3	24	1	45865	2	0	43	77608	0.524197	0
4	32	0	79513	2	0	15	145595	0.687570	1
...	...	...	...	...	...	...	...	...	...
2495	41	1	89237	0	0	57	511226	0.674244	1
2496	61	1	15527	0	1	25	325019	0.394591	0
2497	82	1	71102	4	0	9	172144	0.721172	0
2498	83	1	54126	3	1	86	127325	0.296527	0
2499	72	1	38403	4	1	21	365896	0.631007	1

2500 rows × 9 columns

In [3]:

Copied!





X_train = train.drop("default", axis=1) 
y_train = train[["default"]] # default is the ground truth
X_test = test.drop("default", axis=1) 
y_test = test[["default"]] # default is the ground truth
X_train = train.drop("default", axis=1) 
y_train = train[["default"]] # default is the ground truth
X_test = test.drop("default", axis=1) 
y_test = test[["default"]] # default is the ground truth

Next, train a simple logistic regression model and save the model file for upload into AI Verify

In [4]:

Copied!





classifier = LogisticRegression(fit_intercept=True)
classifier.fit(X_train, y_train)
y_pred_lr = classifier.predict(X_test)
score = f1_score(y_pred_lr, y_test)
training_score = cross_val_score(classifier, X_train, y_train, cv=5)
print("Accuracy: %0.2f(+/- %0.2f)" % (training_score.mean(), training_score.std() * 2))
print("F1 Score: %0.2f"  % (score))

# Join the labels back to the original data
X_test["default"] = y_test["default"]

pickle.dump(classifier, open("model/binary_classification_mock_credit_risk_sklearn.linear_model._logistic.LogisticRegression.sav", "wb+"))
classifier = LogisticRegression(fit_intercept=True)
classifier.fit(X_train, y_train)
y_pred_lr = classifier.predict(X_test)
score = f1_score(y_pred_lr, y_test)
training_score = cross_val_score(classifier, X_train, y_train, cv=5)
print("Accuracy: %0.2f(+/- %0.2f)" % (training_score.mean(), training_score.std() * 2))
print("F1 Score: %0.2f"  % (score))

# Join the labels back to the original data
X_test["default"] = y_test["default"]

pickle.dump(classifier, open("model/binary_classification_mock_credit_risk_sklearn.linear_model._logistic.LogisticRegression.sav", "wb+"))

Accuracy: 0.51(+/- 0.04)
F1 Score: 0.57

For the above example, users may upload the following for testing:

Data: 'data/pickle_pandas_mock_binary_classification_credit_risk_testing.sav'
Ground Truth Dataset: 'data/pickle_pandas_mock_binary_classification_credit_risk_testing.sav' ; Select Ground Truth : default
Model: 'model/binary_classification_mock_credit_risk_sklearn.linear_model._logistic.LogisticRegression.sav'

2. Sample Multiclass Classification Data and Model Preparation¶

Firstly, load the training and testing dataset

In [5]:

Copied!





train = pickle.load(open('data/pickle_pandas_mock_multiclass_classification_toxic_classification_training.sav','rb'))
test = pickle.load(open('data/pickle_pandas_mock_multiclass_classification_toxic_classification_testing.sav','rb'))
print("Training dataset:")
display(train)
print("Testing dataset")
display(test)
train = pickle.load(open('data/pickle_pandas_mock_multiclass_classification_toxic_classification_training.sav','rb'))
test = pickle.load(open('data/pickle_pandas_mock_multiclass_classification_toxic_classification_testing.sav','rb'))
print("Training dataset:")
display(train)
print("Testing dataset")
display(test)

Training dataset:

	age	gender	race	ban_count	prior_count	toxic_words	toxic
0	37	0	5	759	730	98950	1
1	18	1	2	960	332	68385	3
2	50	1	4	904	38	13578	2
3	55	1	3	351	633	38542	2
4	26	0	5	897	238	21792	3
...	...	...	...	...	...	...	...
495	74	1	2	666	735	17929	1
496	36	0	1	707	844	5651	4
497	29	1	0	169	461	1910	0
498	40	1	1	120	922	80071	4
499	83	0	2	731	351	93608	1

500 rows × 7 columns

Testing dataset

	age	gender	race	ban_count	prior_count	toxic_words	toxic
0	73	0	4	405	397	1877	3
1	78	0	1	705	866	31332	4
2	32	0	5	204	382	28503	3
3	84	0	0	247	422	32166	3
4	48	0	2	150	593	17861	4
...	...	...	...	...	...	...	...
245	50	1	4	285	578	34200	0
246	77	1	0	508	433	98875	4
247	22	0	5	811	138	64705	0
248	56	0	4	532	69	61868	0
249	23	0	5	133	239	56596	0

250 rows × 7 columns

In [6]:

Copied!





X_train = train.drop("toxic", axis=1) 
y_train = train[["toxic"]] # toxic is the ground truth
X_test = test.drop("toxic", axis=1) 
y_test = test[["toxic"]] # toxic is the ground truth
X_train = train.drop("toxic", axis=1) 
y_train = train[["toxic"]] # toxic is the ground truth
X_test = test.drop("toxic", axis=1) 
y_test = test[["toxic"]] # toxic is the ground truth

In [7]:

Copied!





classifier = LogisticRegression(fit_intercept=True)
classifier.fit(X_train, y_train)
y_pred_lr = classifier.predict(X_test)
score = f1_score(y_pred_lr, y_test, average='micro')
training_score = cross_val_score(classifier, X_train, y_train, cv=5)
print("Accuracy: %0.2f(+/- %0.2f)" % (training_score.mean(), training_score.std() * 2))
print("F1 Score: %0.2f"  % (score))

# Join the labels back to the original data
X_test["toxic"] = y_test["toxic"]

pickle.dump(classifier, open("model/multiclass_classification_mock_toxic_classification_sklearn.linear_model._logistic.LogisticRegression.sav", "wb+"))
classifier = LogisticRegression(fit_intercept=True)
classifier.fit(X_train, y_train)
y_pred_lr = classifier.predict(X_test)
score = f1_score(y_pred_lr, y_test, average='micro')
training_score = cross_val_score(classifier, X_train, y_train, cv=5)
print("Accuracy: %0.2f(+/- %0.2f)" % (training_score.mean(), training_score.std() * 2))
print("F1 Score: %0.2f"  % (score))

# Join the labels back to the original data
X_test["toxic"] = y_test["toxic"]

pickle.dump(classifier, open("model/multiclass_classification_mock_toxic_classification_sklearn.linear_model._logistic.LogisticRegression.sav", "wb+"))

Accuracy: 0.21(+/- 0.03)
F1 Score: 0.23

For the above example, users may upload the following for testing:

Data: 'data/pickle_pandas_mock_multiclass_classification_toxic_classification_testing.sav'
Ground Truth Dataset: 'data/pickle_pandas_mock_multiclass_classification_toxic_classification_testing.sav' ; Select Ground Truth : toxic
Model: 'model/multiclass_classification_mock_toxic_classification_sklearn.linear_model._logistic.LogisticRegression.sav'

3. Sample Regression Data and Model Preparation¶

Firstly, load the training and testing dataset

In [8]:

Copied!





train = pickle.load(open('data/pickle_pandas_mock_regression_donation_training.sav','rb'))
test = pickle.load(open('data/pickle_pandas_mock_regression_donation_testing.sav','rb'))
print("Training dataset:")
display(train)
print("Testing dataset")
display(test)
train = pickle.load(open('data/pickle_pandas_mock_regression_donation_training.sav','rb'))
test = pickle.load(open('data/pickle_pandas_mock_regression_donation_testing.sav','rb'))
print("Training dataset:")
display(train)
print("Testing dataset")
display(test)

Training dataset:

	age	gender	race	income	employment	employment_length	total_donated_amount	number_of_donation	donation
0	73	0	4	855779	1	74	4505	606	17081
1	46	1	3	123349	0	9	610835	9	21254
2	58	1	5	743050	1	34	152778	415	53137
3	42	0	0	914273	1	21	968674	420	64253
4	50	0	0	281412	1	19	633223	992	27056
...	...	...	...	...	...	...	...	...	...
495	72	1	3	722808	0	31	736206	484	93661
496	54	0	0	141898	0	26	53598	149	3476
497	82	1	0	222256	0	70	174650	505	34612
498	68	1	4	621078	1	66	336243	456	53985
499	58	0	1	439528	1	95	957597	7	49153

500 rows × 9 columns

Testing dataset

	age	gender	race	income	employment	employment_length	total_donated_amount	number_of_donation	donation
0	56	1	3	279800	0	24	835513	459	70836
1	27	1	5	445728	1	64	899407	350	11806
2	61	0	3	252790	1	37	209458	775	5801
3	80	1	1	756310	0	30	256372	519	44327
4	42	1	4	416035	1	73	528901	722	21318
...	...	...	...	...	...	...	...	...	...
245	20	1	5	593096	0	46	176847	227	2824
246	52	1	1	643703	0	81	818788	892	62909
247	66	1	1	986261	0	53	753968	633	89891
248	22	0	4	544797	1	9	5397	380	74806
249	59	0	1	140192	1	42	606265	490	90880

250 rows × 9 columns

In [9]:

Copied!





X_train = train.drop("donation", axis=1) 
y_train = train[["donation"]] # donation is the ground truth
X_test = test.drop("donation", axis=1) 
y_test = test[["donation"]] # donation is the ground truth
X_train = train.drop("donation", axis=1) 
y_train = train[["donation"]] # donation is the ground truth
X_test = test.drop("donation", axis=1) 
y_test = test[["donation"]] # donation is the ground truth

In [10]:

Copied!





regression_model = LinearRegression(fit_intercept=True)
regression_model.fit(X_train, y_train)
y_pred_lr = regression_model.predict(X_test)
training_score = cross_val_score(regression_model, X_test, y_test, cv=5)
print("Accuracy: %0.2f(+/- %0.2f)" % (training_score.mean(), training_score.std() * 2))
print("Mean Squared Error regression loss %0.2f" % (mean_squared_error(y_test, y_pred_lr)))

# Join the labels back to the original data
X_test["donation"] = y_test["donation"]

pickle.dump(classifier, open("model/regression_mock_donation_sklearn.linear_model._base.LinearRegression.sav", "wb+"))
regression_model = LinearRegression(fit_intercept=True)
regression_model.fit(X_train, y_train)
y_pred_lr = regression_model.predict(X_test)
training_score = cross_val_score(regression_model, X_test, y_test, cv=5)
print("Accuracy: %0.2f(+/- %0.2f)" % (training_score.mean(), training_score.std() * 2))
print("Mean Squared Error regression loss %0.2f" % (mean_squared_error(y_test, y_pred_lr)))

# Join the labels back to the original data
X_test["donation"] = y_test["donation"]

pickle.dump(classifier, open("model/regression_mock_donation_sklearn.linear_model._base.LinearRegression.sav", "wb+"))

Accuracy: -0.05(+/- 0.05)
Mean Squared Error regression loss 892177366.90

For the above example, users may upload the following for testing:

Data: 'data/pickle_pandas_mock_regression_donation_testing.sav'
Ground Truth Dataset: 'data/pickle_pandas_mock_regression_donation_testing.sav' ; Select Ground Truth : donation
Model: 'model/regression_mock_donation_sklearn.linear_model._base.LinearRegression.sav'

4. Sample Pipeline Model Preparation¶

As a demonstration, this example will guide users to create a simple pipeline to perform binary classification

Firstly, load the training datasets. In this example, the training data and the ground truth labels are contained in two separate files and have been loaded as X_train and y_train respectively.

In [11]:

Copied!





X_train = pickle.load(open('data/pickle_pandas_mock_binary_classification_pipeline_credit_risk_training.sav','rb'))
y_train = pickle.load(open('data/pickle_pandas_mock_binary_classification_pipeline_credit_risk_ytrain.sav','rb'))
print("Training dataset:")
display(train)
print("Training Ground Truth:")
display(y_train)
X_train = pickle.load(open('data/pickle_pandas_mock_binary_classification_pipeline_credit_risk_training.sav','rb'))
y_train = pickle.load(open('data/pickle_pandas_mock_binary_classification_pipeline_credit_risk_ytrain.sav','rb'))
print("Training dataset:")
display(train)
print("Training Ground Truth:")
display(y_train)

Training dataset:

	age	gender	race	income	employment	employment_length	total_donated_amount	number_of_donation	donation
0	73	0	4	855779	1	74	4505	606	17081
1	46	1	3	123349	0	9	610835	9	21254
2	58	1	5	743050	1	34	152778	415	53137
3	42	0	0	914273	1	21	968674	420	64253
4	50	0	0	281412	1	19	633223	992	27056
...	...	...	...	...	...	...	...	...	...
495	72	1	3	722808	0	31	736206	484	93661
496	54	0	0	141898	0	26	53598	149	3476
497	82	1	0	222256	0	70	174650	505	34612
498	68	1	4	621078	1	66	336243	456	53985
499	58	0	1	439528	1	95	957597	7	49153

500 rows × 9 columns

Training Ground Truth:

	default
0	1
1	0
2	1
3	0
4	1
...	...
495	1
496	1
497	0
498	1
499	0

500 rows × 1 columns

Defining and training the pipeline¶

With the training data and labels prepared, you may now define and train a custom pipeline. In this example, the pipeline is created to do feature engineering and make predictions with a final estimator.

In [12]:

Copied!





from sklearn.preprocessing import LabelEncoder

class featureEngineeringStage():
    def __init__(self, columns, selection):
        self.columns = columns
        self.selection = selection
    
    def transform(self, X, y=None):
        """Transform columns of X using LabelEncoder.
        """
        output = X.copy()
        for col in self.columns:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output[self.selection]
    
    def fit(self, X, y=None):
        return self
from sklearn.preprocessing import LabelEncoder

class featureEngineeringStage():
    def __init__(self, columns, selection):
        self.columns = columns
        self.selection = selection
    
    def transform(self, X, y=None):
        """Transform columns of X using LabelEncoder.
        """
        output = X.copy()
        for col in self.columns:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output[self.selection]
    
    def fit(self, X, y=None):
        return self

In [13]:

Copied!

pipe = Pipeline([
    ('featureEngineering', featureEngineeringStage(columns=["gender"], selection=["gender"])),
    ('model',  LogisticRegression(fit_intercept=True))])
pipe = Pipeline([
    ('featureEngineering', featureEngineeringStage(columns=["gender"], selection=["gender"])),
    ('model',  LogisticRegression(fit_intercept=True))])

Training the pipeline:

In [14]:

Copied!

pipe.fit(X_train, y_train)
pipe.fit(X_train, y_train)

Out[14]:

Pipeline(steps=[('featureEngineering',
                 <__main__.featureEngineeringStage object at 0x0000022DD51478B0>),
                ('model', LogisticRegression())])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Save the trained pipeline:

In [15]:

Copied!

pickle.dump(pipe, open("pipeline/binary_classification_tabular_credit_loan/binary_classification_pipeline_credit_risk_sklearn.pipeline.Pipeline.sav", "wb+"))
pickle.dump(pipe, open("pipeline/binary_classification_tabular_credit_loan/binary_classification_pipeline_credit_risk_sklearn.pipeline.Pipeline.sav", "wb+"))

In [16]:

Copied!





X_test = pickle.load(open('./data/pickle_pandas_mock_binary_classification_pipeline_credit_risk_testing.sav','rb'))
y_test = pickle.load(open('./data/pickle_pandas_mock_binary_classification_pipeline_credit_risk_ytest.sav','rb'))
y_pred_lr = pipe.predict(X_test)
score = f1_score(y_pred_lr, y_test)
training_score = cross_val_score(pipe, X_train, y_train, cv=5)
print("Accuracy: %0.2f(+/- %0.2f)" % (training_score.mean(), training_score.std() * 2))
print("F1 Score: %0.2f"  % (score))
X_test = pickle.load(open('./data/pickle_pandas_mock_binary_classification_pipeline_credit_risk_testing.sav','rb'))
y_test = pickle.load(open('./data/pickle_pandas_mock_binary_classification_pipeline_credit_risk_ytest.sav','rb'))
y_pred_lr = pipe.predict(X_test)
score = f1_score(y_pred_lr, y_test)
training_score = cross_val_score(pipe, X_train, y_train, cv=5)
print("Accuracy: %0.2f(+/- %0.2f)" % (training_score.mean(), training_score.std() * 2))
print("F1 Score: %0.2f"  % (score))

Accuracy: 0.54(+/- 0.01)
F1 Score: 0.00

To test pipeline models with AI Verify, prepare a model folder containing:

A python file containing the class files that is used in the pipeline (i.e.featureEngineeringStage in this example). Tip: Remember to include the relevant library imports.
The trained pipeline file (i.e. 'binary_classification_pipeline_credit_risk_sklearn.pipeline.Pipeline.sav' in this example)

An example of a pipeline model folder structure:

└── binary_classification_tabular_credit_loan
    ├── binary_classification_pipeline_credit_risk_sklearn.pipeline.Pipeline.sav
    └── creditCustomClass.py

For the above example, users may upload the following for testing:

Data: 'data/pickle_pandas_mock_binary_classification_pipeline_credit_risk_testing.sav'
Ground Truth Dataset: 'data/pickle_pandas_mock_binary_classification_pipeline_credit_risk_ytest.sav' ; Select Ground Truth : default
Model: 'pipeline/binary_classification_tabular_credit_loan' ; Note that the model should be uploaded as a folder as it is a pipeline.