developer.chat
22 September 2024
category
In [1]:
use_review_text = True # Set to False to train without the review-text features (selects the input files and the 'polarity' / vectorized columns).
use_count_vectorization = True # Set to False to leave out the count-vectorization columns; only consulted when use_review_text is True.
In [2]:
# Choose the input files (data + per-column dtypes) and the output file
# that belong to the selected feature set.
if use_review_text:
    # With review text.
    df_types_filename = '../input/airline-review-data-preprocessing-pt-2-nlp/NLPFinalDataLightTypes.csv'
    df_filename = '../input/airline-review-data-preprocessing-pt-2-nlp/NLPFinalDataLight.csv'
    df_out_filename = './Preds-WithText.csv'
else:
    # Without review text.
    df_types_filename = '../input/airline-reviews-eda-and-preprocessing-pt-1/PreprocessedDataLightTypes.csv'
    df_filename = '../input/airline-reviews-eda-and-preprocessing-pt-1/PreprocessedDataLight.csv'
    df_out_filename = './Preds-WithoutText.csv'
In [3]:
# Define numerical and categorical features.
# Numerical features shared by every configuration.
num_feats = ['date_flown_month',
             'date_flown_year',
             'review_date_date_flown_distance_days',
             'review_characters',
             'has_layover_num',
             'seat_comfort',
             'cabin_service',
             'food_bev',
             'entertainment',
             'ground_service',
             'value_for_money']
# Categorical features are identical in every configuration.
cat_feats = ['airline',
             'traveller_type',
             'cabin']

if use_review_text:
    # With review text: the sentiment polarity is always added.
    num_feats.append('polarity')
    if use_count_vectorization:
        # The vectorized feature names are stored as one ', '-separated line.
        with open('../input/airline-review-data-preprocessing-pt-2-nlp/VecReviewTextCleanFeats.csv', 'r') as f:
            # strip() keeps a trailing newline from ending up inside the last
            # feature name, which would not match any DataFrame column.
            vec_feats = f.read().strip().split(', ')
        num_feats = num_feats + vec_feats

feats = num_feats + cat_feats
In [4]:
# Set this variable to the desired method for data transformation; it decides
# which prepared feature matrices are handed to the model further down.
# Possible options are: scaling_and_one_hot_encoding, label_encoding, no_transformation.
transform_dataset = 'label_encoding'
In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_palette('Set2')
import scipy.sparse
import datetime as dt
import dateutil
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_curve, accuracy_score, roc_auc_score, confusion_matrix
import lightgbm as lgb
import importlib
In [6]:
# Build the column -> dtype mapping used when reading the input data.
df_dtype = pd.read_csv(df_types_filename)
dict_dtype = df_dtype.set_index('index')['dtypes'].to_dict()
# The target column is always read as a boolean, whatever the types file says.
dict_dtype['recommended'] = 'bool'
In [7]:
# Input data: '_' is the only token treated as missing; the unnamed index
# column left over from the CSV export is discarded.
df = (
    pd.read_csv(df_filename, dtype=dict_dtype, keep_default_na=False, na_values=['_'])
    .drop(columns=['Unnamed: 0'])
)
In [8]:
df.head()
Out[8]:
In [9]:
df.shape
Out[9]:
In [10]:
# One row per customer review.
n_reviews = len(df)
print('Number of customer reviews in the dataset: {:d}'.format(n_reviews))
In [11]:
# Utility function to assign the label to our dataset
def assign_label_recommended(df_row):
    """
    Translate a row's 'recommended' flag into a numeric label.

    Returns 1 when the flag equals True, 0 when it equals False and
    None for any other value.
    """
    flag = df_row['recommended']
    # Equality (not identity) so non-builtin booleans such as numpy.bool_ match too.
    if flag == True:
        return 1
    if flag == False:
        return 0
    return None
In [12]:
# Map the 'recommended' flag to a numeric label (True -> 1, False -> 0) in one
# vectorized step instead of a row-wise apply, which materialises every row
# (all columns) just to read a single field.
# Values other than True/False would become NaN; none are expected because the
# column is read with dtype 'bool'.
df['label'] = df['recommended'].map({True: 1, False: 0})
In [13]:
df.head()
Out[13]:
In [14]:
# Integer copy of the layover flag; created first because 'has_layover_num'
# is itself one of the numerical features cast below.
df['has_layover_num'] = df['has_layover'].astype(int)

# Cast 'date_flown_day' plus every numerical feature to int, skipping any
# feature whose name contains 'polarity'.
int_feats = ['date_flown_day'] + [feat for feat in num_feats if 'polarity' not in feat]
for feat in int_feats:
    df[feat] = df[feat].astype(int)
In [15]:
df.head()
Out[15]:
In [16]:
# Feature matrix: the numerical and categorical columns listed in `feats`.
X = df[feats]
# Target: the `label` column as an array.
y = df['label'].values
In [17]:
# Class balance of the target: share of rows labelled 1 (recommended)
# and share labelled 0 (not recommended).
n_labels = y.shape[0]
f_rec = np.count_nonzero(y == 1) / n_labels
f_not_rec = np.count_nonzero(y == 0) / n_labels
print('Fraction of customers that recommended the service: {:.2f}'.format(f_rec))
print('Fraction of customers that did not recommend the service: {:.2f}'.format(f_not_rec))
In [18]:
# Numerical features: fill gaps with the column mean, then standardise.
num_proc = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    StandardScaler(),
)
# Categorical features: fill gaps with the constant 'missing', then one-hot
# encode; categories unseen during fitting are ignored at transform time.
cat_proc = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'),
)
# Route each group of columns through its own pipeline.
preprocessor = make_column_transformer(
    (num_proc, num_feats),
    (cat_proc, cat_feats),
)
In [19]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
In [20]:
# Fit the preprocessing pipelines on the training split only, then transform it.
X_train_transformed = preprocessor.fit_transform(X_train)
In [21]:
# Names of the one-hot columns produced by the fitted categorical pipeline
# (second transformer of the column transformer).
one_hot_encoder = preprocessor.transformers_[1][1]['onehotencoder']
if hasattr(one_hot_encoder, 'get_feature_names_out'):
    # Current scikit-learn API.
    cat_feats_one_hot = one_hot_encoder.get_feature_names_out(cat_feats)
else:
    # Older scikit-learn releases only provide the legacy method.
    cat_feats_one_hot = one_hot_encoder.get_feature_names(cat_feats)

# Column order of the transformed matrix: numerical features first, then the
# one-hot columns.
all_feats = list(num_feats) + list(cat_feats_one_hot)
# Positional column index -> readable feature name.
dict_for_renaming_cols = dict(enumerate(all_feats))
In [22]:
def _to_named_frame(matrix, column_names):
    """
    Wrap a transformed feature matrix in a DataFrame with readable column names.

    matrix: dense array or scipy sparse matrix returned by the preprocessor.
    column_names: mapping from positional column index to feature name.
    Returns a (sparse-backed, when the input is sparse) DataFrame whose
    columns are renamed according to `column_names`.
    """
    if scipy.sparse.issparse(matrix):
        frame = pd.DataFrame.sparse.from_spmatrix(matrix)
    else:
        frame = pd.DataFrame(matrix)
    return frame.rename(columns=column_names)


# Training split (already transformed by the fit step).
X_train_transformed_2 = _to_named_frame(X_train_transformed, dict_for_renaming_cols)

# Test split, transformed with the preprocessor fitted on the training split.
X_test_transformed = preprocessor.transform(X_test)
X_test_transformed_2 = _to_named_frame(X_test_transformed, dict_for_renaming_cols)

# Full dataset, transformed with the same fitted preprocessor.
X_transformed = preprocessor.transform(X)
X_transformed_2 = _to_named_frame(X_transformed, dict_for_renaming_cols)
In [23]:
X_train.shape
Out[23]:
In [24]:
X_train_transformed_2.shape
Out[24]:
In [25]:
X_test.shape
Out[25]:
In [26]:
X_test_transformed_2.shape
Out[26]:
In [27]:
# Single encoder instance, reused for each categorical feature in turn.
le = LabelEncoder()
In [28]:
# Work on copies so the un-encoded frames stay available unchanged.
X_label_enc, X_train_label_enc, X_test_label_enc = (
    frame.copy() for frame in (X, X_train, X_test)
)
In [29]:
for feat in cat_feats:
    print('Feature:', feat)
    # Learn the category -> integer mapping once, from the full frame, and
    # apply that same mapping to every frame. Re-fitting the encoder on each
    # frame separately can give one category different codes in train and
    # test whenever a split does not contain every category.
    # Train and test are row subsets of the full frame, so every value they
    # contain is known to the fitted encoder.
    le.fit(X_label_enc[feat])
    X_label_enc[feat] = le.transform(X_label_enc[feat])
    X_train_label_enc[feat] = le.transform(X_train_label_enc[feat])
    X_test_label_enc[feat] = le.transform(X_test_label_enc[feat])
In [30]:
X_label_enc[cat_feats].head()
Out[30]:
In [31]:
# Pick the feature matrices (train / test / full, plus the ones kept for SHAP)
# that correspond to the configured transformation method.
if transform_dataset == 'scaling_and_one_hot_encoding':
    print('Method for data transformation: scaling and one hot encoding')
    X_train_for_model = X_train_transformed_2
    X_test_for_model = X_test_transformed_2
    X_for_model = X_transformed_2
    X_test_for_shap = X_test_transformed_2
    X_for_shap = X_transformed_2
elif transform_dataset == 'label_encoding':
    print('Method for data transformation: label encoding')
    X_train_for_model = X_train_label_enc
    X_test_for_model = X_test_label_enc
    X_for_model = X_label_enc
    X_test_for_shap = X_test_label_enc
    X_for_shap = X_label_enc
elif transform_dataset == 'no_transformation':
    print('Method for data transformation: no transformation')
    X_train_for_model = X_train
    X_test_for_model = X_test
    X_for_model = X
    X_test_for_shap = X_test
    X_for_shap = X
else:
    # Fail here with a clear message instead of a NameError further down,
    # where the *_for_model variables would be undefined.
    raise ValueError(
        'Unknown transform_dataset value {!r}; expected one of: '
        'scaling_and_one_hot_encoding, label_encoding, no_transformation'.format(transform_dataset)
    )
In [32]:
cat_feats
Out[32]:
In [33]:
# Only the label-encoding option declares the categorical columns to LightGBM;
# every other setting builds the datasets without that argument.
if transform_dataset == 'label_encoding':
    dataset_kwargs = {'categorical_feature': cat_feats}
else:
    dataset_kwargs = {}

train_data = lgb.Dataset(X_train_for_model, label=y_train, **dataset_kwargs)
test_data = lgb.Dataset(X_test_for_model, label=y_test, **dataset_kwargs)
# LightGBM training parameters for the binary classifier.
params = dict(
    metric='binary_logloss',
    boosting_type='gbdt',
    objective='binary',
    feature_fraction=0.5,
    num_leaves=15,
    max_depth=10,
    # NOTE(review): LightGBM documents `n_estimators` as an alias of
    # `num_iterations`; confirm how it interacts with the round count passed
    # to lgb.train.
    n_estimators=200,
    min_data_in_leaf=200,
    min_child_weight=0.1,
    reg_alpha=2,
    reg_lambda=5,
    # NOTE(review): `subsample` is documented as an alias of `bagging_fraction`;
    # verify whether a bagging frequency must also be set for it to take effect.
    subsample=0.8,
    verbose=-1,
)
In [34]:
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train were
# removed in LightGBM 4. Older releases name the logging callback
# `print_evaluation`, so fall back to it when `log_evaluation` is missing.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
lgbm = lgb.train(params,
                 train_data,
                 2500,
                 valid_sets=[test_data],
                 callbacks=[lgb.early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
                            _log_evaluation(period=20)]               # log the validation metric every 20 rounds
                 )

# Predicted probabilities for the full dataset (X_for_model / y cover all rows,
# not only the held-out split), rounded to hard 0/1 predictions.
y_prob = lgbm.predict(X_for_model)
y_pred = y_prob.round(0)

clf_roc_auc_score = roc_auc_score(y, y_prob)
clf_accuracy_score = accuracy_score(y, y_pred)
print('Model overall ROC AUC score: {:.3f}'.format(clf_roc_auc_score))
print('Model overall accuracy: {:.3f}'.format(clf_accuracy_score))
In [35]:
# Sanity check: hard predictions and probabilities should both lie in [0, 1].
range_checks = (
    ('Min value of prediction: {:.3f}', y_pred.min()),
    ('Max value of prediction: {:.3f}', y_pred.max()),
    ('Min value of probability: {:.3f}', y_prob.min()),
    ('Max value of probability: {:.3f}', y_prob.max()),
)
for message, value in range_checks:
    print(message.format(value))
In [36]:
# Derive recall, specificity and precision from the 2x2 confusion matrix
# (rows: actual class, columns: predicted class).
(tn, fp), (fn, tp) = confusion_matrix(y, y_pred)
sensitivity = tp / (tp + fn)  # Recall: share of actual positives that were found.
specificity = tn / (tn + fp)  # Share of actual negatives that were found.
precision = tp / (tp + fp)    # Share of predicted positives that are correct.
print('Sensitivity/Recall: %.2f' % sensitivity)
print('Specificity: %.2f' % specificity)
print('Precision: %.2f' % precision)
In [37]:
def plot_confusion_matrix(y, y_pred, normalize_str, figsize_w, figsize_h, filename):
    """
    Draw a classifier's confusion matrix as an annotated heatmap.

    y: actual labels.
    y_pred: predicted labels.
    normalize_str: forwarded to confusion_matrix as its `normalize` argument.
    figsize_w, figsize_h: figure width and height.
    filename: path the figure is saved to before it is shown.
    """
    plt.figure(figsize=(figsize_w, figsize_h))
    plt.title('Confusion matrix')

    # Rows and columns are both labelled with the distinct values of y.
    class_labels = np.unique(y)
    cm = confusion_matrix(y, y_pred, normalize=normalize_str)
    df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'

    sns.set(font_scale=1.4)
    sns.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 16})

    # Save first, then display.
    plt.savefig(filename)
    plt.show()
In [38]:
# Row-normalised confusion matrix, saved next to the notebook.
plot_confusion_matrix(
    y=y,
    y_pred=y_pred,
    normalize_str='true',
    figsize_w=4,
    figsize_h=4,
    filename='./ConfusionMatrix.png',
)
In [39]:
# False positive rate and true positive rate along the ROC curve; the third
# return value of roc_curve is not needed here and is discarded.
fpr, tpr, _ = roc_curve(y, y_prob)
In [40]:
def plot_roc_curve(fpr, tpr, clf_name, figsize_w, figsize_h, filename):
    """
    Draw a classifier's ROC curve together with the chance diagonal.

    fpr, tpr: false and true positive rates to plot.
    clf_name: legend label for the classifier's curve.
    figsize_w, figsize_h: figure width and height.
    filename: path the figure is saved to before it is shown.
    """
    plt.figure(figsize=(figsize_w, figsize_h))
    sns.set(style="whitegrid")

    # Dashed diagonal as the reference line, labelled 'random'.
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, 'k--', label='random')
    plt.plot(fpr, tpr, label=clf_name)

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')

    # Save first, then display.
    plt.savefig(filename)
    plt.show()
In [41]:
# ROC curve of the LightGBM model, saved next to the notebook.
plot_roc_curve(
    fpr=fpr,
    tpr=tpr,
    clf_name='LightGBM',
    figsize_w=6,
    figsize_h=6,
    filename='./ROCCurve.png',
)
In [42]:
# Collect the hard predictions and the probabilities in a fresh dataframe.
df_out = pd.DataFrame({'y_pred': y_pred, 'y_prob': y_prob})