Instacart - Feature Engineering
In this notebook we use Pandas to engineer features from the Instacart Market Basket Analysis Kaggle competition. In the original competition, the model had to predict which products would be reordered in each customer's final order.
We engineer the features so that each customer's order history (a time series) is flattened into a single row per customer-product pair. This lets us build a simple reorder prediction model without resorting to computationally expensive recurrent neural network architectures such as LSTMs.
Support Functions
from IPython.display import Markdown, display
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats
#----------------------------------------------------------------------
# Functions to load datasets into memory using space efficient data types.
def load_orders(path):
    def convert_eval_set(value):
        if 'prior' == value:
            return np.uint8(1)
        elif 'train' == value:
            return np.uint8(2)
        else:
            return np.uint8(3)  # 'test'

    def convert_days_since_prior_order(value):
        # 30 is the maximum value; '' (NaN) marks a customer's first order.
        if '' == value:
            return np.int8(-1)
        else:
            return np.int8(float(value))  # plain float(): the np.float alias is removed in recent NumPy

    orders = pd.read_csv(path,
                         dtype={'order_id': np.uint32,
                                'user_id': np.uint32,
                                'order_number': np.uint8,
                                'order_dow': np.uint8,
                                'order_hour_of_day': np.uint8},
                         converters={'eval_set': convert_eval_set,
                                     'days_since_prior_order': convert_days_since_prior_order})
    orders = orders.astype({'eval_set': np.uint8,
                            'days_since_prior_order': np.int8})
    return orders
def load_orders_prods(path):
    return pd.read_csv(path, dtype={'order_id': np.uint32,
                                    'product_id': np.uint32,
                                    'add_to_cart_order': np.uint8,
                                    'reordered': np.uint8})

def load_products(path):
    return pd.read_csv(path, dtype={'product_id': np.uint16,
                                    'aisle_id': np.uint8,
                                    'department_id': np.uint8})

def load_aisles(path):
    return pd.read_csv(path, dtype={'aisle_id': np.uint8})

def load_depts(path):
    return pd.read_csv(path, dtype={'department_id': np.uint8})
#----------------------------------------------------------------------
# Functions to retrieve orders and products for a specific customer
def GetOrdersList(user_id, exclude_first_order=False):
    print("Retrieving orders for user ID: {0}".format(user_id))
    # eval_set was encoded as an integer by load_orders: 1 = 'prior'.
    mask = (orders.user_id == user_id) & (orders.eval_set == 1)
    if exclude_first_order:
        mask &= (orders.order_number > 1)
    return orders[mask].sort_values('order_number', ascending=True).order_id

def GetProductsForOrder(order_id, prior_orders=True):
    # Assumes the globals prior_prods and last_prods (products of the prior
    # and final orders, respectively) have been loaded.
    if prior_orders:
        return prior_prods[prior_prods.order_id == order_id]
    else:
        return last_prods[last_prods.order_id == order_id]

def GetProductsForAllOrdersByCust(user_id, unique_products_only=False, exclude_first_order=False):
    orders_list = GetOrdersList(user_id, exclude_first_order)
    if unique_products_only:
        return prior_prods[prior_prods.order_id.isin(orders_list)].product_id.unique()
    else:
        return prior_prods[prior_prods.order_id.isin(orders_list)]
#----------------------------------------------------------------------
# Function to generate markdown output
# Ref: https://stackoverflow.com/a/32035217
def printmd(string):
    display(Markdown(string))
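For example, `printmd("**{0:,} orders loaded**".format(len(orders)))` (a hypothetical call) renders as bold markdown in the cell output rather than plain text.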
Load Training Data
orders = load_orders('data/split/sf_val_set_orders.csv')
orders_prods = load_orders_prods('data/split/sf_val_set_prior_order_products.csv')
products = load_products('data/original/products.csv')
aisles = load_aisles('data/original/aisles.csv')
depts = load_depts('data/original/departments.csv')
# Split orders into the prior history (eval_set == 1) and the final held-out
# order per customer.
prior_orders_only = orders[orders.eval_set == 1]
final_orders_only = orders[orders.eval_set != 1]
Meta Features - Mean Order Length and Mean Reorder Ratio Per Customer
# Compute mean order length per customer.
orders_length = orders_prods.groupby('order_id').add_to_cart_order.max().reset_index()
orders_length.rename(columns={'add_to_cart_order': 'total_items_ordered'}, inplace=True)
orders_length_merge = orders_length.merge(prior_orders_only[['order_id','user_id']], on='order_id')
orders_length_merge['order_id'] = orders_length_merge.order_id.astype(np.uint32)
mean_order_length_per_customer = orders_length_merge.groupby('user_id').total_items_ordered.mean().round().reset_index()
mean_order_length_per_customer['user_id'] = mean_order_length_per_customer.user_id.astype(np.uint32)
mean_order_length_per_customer.rename(columns={'total_items_ordered': 'mean_order_length'}, inplace=True)
mean_order_length_per_customer['mean_order_length'] = mean_order_length_per_customer.mean_order_length.astype(np.uint16)
del orders_length_merge
# Compute mean reorder ratio per customer.
# For each order compute the ratio of reordered items to total ordered items.
# Dividing two differently-named Series yields an unnamed one, so reset_index()
# produces a column literally named 0 (renamed to 'reorder_ratio' below).
orders_reorder_ratio = orders_prods.groupby('order_id').reordered.sum() / orders_length.set_index('order_id').total_items_ordered
orders_reorder_ratio = orders_reorder_ratio.reset_index()
del orders_length
# Exclude each customer's first order: nothing can be a reorder yet, so its
# ratio is always zero and would skew the mean reorder ratio, both overall
# and per user.
orders_reorder_ratio = orders_reorder_ratio.merge(prior_orders_only[prior_orders_only.order_number > 1], on='order_id')
orders_reorder_ratio.rename(columns={0: 'reorder_ratio'}, inplace=True)
orders_reorder_ratio['order_id'] = orders_reorder_ratio.order_id.astype(np.uint32)
mean_reorder_ratio_per_customer = orders_reorder_ratio.groupby('user_id').reorder_ratio.mean().reset_index()
mean_reorder_ratio_per_customer['user_id'] = mean_reorder_ratio_per_customer.user_id.astype(np.uint32)
mean_reorder_ratio_per_customer.rename(columns={'reorder_ratio': 'mean_reorder_ratio'}, inplace=True)
mean_reorder_ratio_per_customer['mean_reorder_ratio'] = mean_reorder_ratio_per_customer.mean_reorder_ratio.astype(np.float16)
del orders_reorder_ratio
mean_order_length_per_customer.head()
| | user_id | mean_order_length |
|---|---|---|
| 0 | 1 | 6 |
| 1 | 2 | 14 |
| 2 | 5 | 9 |
| 3 | 7 | 10 |
| 4 | 8 | 16 |
mean_reorder_ratio_per_customer.head()
| | user_id | mean_reorder_ratio |
|---|---|---|
| 0 | 1 | 0.784180 |
| 1 | 2 | 0.482422 |
| 2 | 5 | 0.503906 |
| 3 | 7 | 0.793945 |
| 4 | 8 | 0.458984 |
Feature Engineering
Merge Prior Orders, Ordered Products, Aisles and Departments
flat_order_prods = orders_prods.merge(prior_orders_only, on='order_id')
flat_order_prods = flat_order_prods.merge(products[['product_id','aisle_id','department_id']], on='product_id')
flat_order_prods.head()
| | order_id | product_id | add_to_cart_order | reordered | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order | aisle_id | department_id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 40462 | 1 | 0 | 22352 | 1 | 4 | 1 | 12 | 30 | 31 | 7 |
| 1 | 21553 | 40462 | 1 | 1 | 31136 | 1 | 13 | 1 | 13 | 25 | 31 | 7 |
| 2 | 208931 | 40462 | 1 | 1 | 7137 | 1 | 4 | 2 | 9 | 6 | 31 | 7 |
| 3 | 424301 | 40462 | 1 | 1 | 31136 | 1 | 7 | 2 | 11 | 12 | 31 | 7 |
| 4 | 455382 | 40462 | 20 | 0 | 33665 | 1 | 2 | 0 | 19 | 7 | 31 | 7 |
Product Probabilities per Customer
product_probs_per_cust = flat_order_prods.groupby(['user_id','product_id']).reordered.sum() + 1
product_probs_per_cust = product_probs_per_cust / product_probs_per_cust.reset_index().groupby(['user_id']).reordered.sum()
product_probs_per_cust = product_probs_per_cust.reset_index()
product_probs_per_cust['user_id'] = product_probs_per_cust.user_id.astype(np.uint32)
product_probs_per_cust['product_id'] = product_probs_per_cust.product_id.astype(np.uint32)
product_probs_per_cust.rename(columns={'reordered': 'prob'}, inplace=True)
product_probs_per_cust.head()
| | user_id | product_id | prob |
|---|---|---|---|
| 0 | 1 | 196 | 0.169492 |
| 1 | 1 | 10258 | 0.152542 |
| 2 | 1 | 10326 | 0.016949 |
| 3 | 1 | 12427 | 0.169492 |
| 4 | 1 | 13032 | 0.050847 |
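The `+ 1` on the reorder counts is add-one (Laplace) smoothing: a product the customer ordered but never reordered still receives a small non-zero probability. Because the denominator sums the smoothed counts, each customer's probabilities sum to 1:

$$P(p \mid u) = \frac{r_{u,p} + 1}{\sum_{p'} \left(r_{u,p'} + 1\right)}$$

where $r_{u,p}$ is the number of times customer $u$ reordered product $p$. The aisle and department probabilities below follow the same pattern.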
Aisle Probabilities per Customer
aisle_probs_per_cust = flat_order_prods.groupby(['user_id','aisle_id']).reordered.sum() + 1
aisle_probs_per_cust = aisle_probs_per_cust / aisle_probs_per_cust.reset_index().groupby(['user_id']).reordered.sum()
aisle_probs_per_cust = aisle_probs_per_cust.reset_index()
aisle_probs_per_cust['user_id'] = aisle_probs_per_cust.user_id.astype(np.uint32)
aisle_probs_per_cust['aisle_id'] = aisle_probs_per_cust.aisle_id.astype(np.uint8)
aisle_probs_per_cust.rename(columns={'reordered': 'prob'}, inplace=True)
aisle_probs_per_cust.head()
| | user_id | aisle_id | prob |
|---|---|---|---|
| 0 | 1 | 21 | 0.150943 |
| 1 | 1 | 23 | 0.207547 |
| 2 | 1 | 24 | 0.037736 |
| 3 | 1 | 45 | 0.018868 |
| 4 | 1 | 53 | 0.037736 |
Department Probabilities per Customer
dept_probs_per_cust = flat_order_prods.groupby(['user_id','department_id']).reordered.sum() + 1
dept_probs_per_cust = dept_probs_per_cust / dept_probs_per_cust.reset_index().groupby(['user_id']).reordered.sum()
dept_probs_per_cust = dept_probs_per_cust.reset_index()
dept_probs_per_cust['user_id'] = dept_probs_per_cust.user_id.astype(np.uint32)
dept_probs_per_cust['department_id'] = dept_probs_per_cust.department_id.astype(np.uint8)
dept_probs_per_cust.rename(columns={'reordered': 'prob'}, inplace=True)
dept_probs_per_cust.head()
| | user_id | department_id | prob |
|---|---|---|---|
| 0 | 1 | 4 | 0.041667 |
| 1 | 1 | 7 | 0.250000 |
| 2 | 1 | 13 | 0.020833 |
| 3 | 1 | 14 | 0.062500 |
| 4 | 1 | 16 | 0.187500 |
Days Since First Order (DSFO) per Order per Customer
DSFO_popc = prior_orders_only.copy()
# Days since first order (DSFO) per order per customer: a cumulative sum of
# the gaps between orders. Add one since each user's first order has
# days_since_prior_order set to -1 (by the loader), which would otherwise
# offset the whole running total.
DSFO_popc['DSFO'] = DSFO_popc.groupby(['user_id']).days_since_prior_order.cumsum() + 1
DSFO_popc['DSFO'] = DSFO_popc.DSFO.astype(np.uint16)
del DSFO_popc['eval_set']
del DSFO_popc['order_number']
del DSFO_popc['order_dow']
del DSFO_popc['order_hour_of_day']
del DSFO_popc['days_since_prior_order']
DSFO_popc.head()
| | order_id | user_id | DSFO |
|---|---|---|---|
| 0 | 2539329 | 1 | 0 |
| 1 | 2398795 | 1 | 15 |
| 2 | 473747 | 1 | 36 |
| 3 | 2254736 | 1 | 65 |
| 4 | 431534 | 1 | 93 |
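To see why the `+ 1` works, here is a toy re-computation using the gaps implied by the first three rows above (user 1's orders were placed 15 and 21 days apart):
# The first order carries the -1 sentinel; later values are real day gaps.
gaps = pd.Series([-1, 15, 21])   # days_since_prior_order for one user
print(gaps.cumsum() + 1)         # 0, 15, 36 -- the DSFO values shown above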
Max Days Since First Order (DSFO) per Customer
max_DSFO_pc = DSFO_popc.groupby(['user_id']).DSFO.max().reset_index()
max_DSFO_pc.rename(columns={'DSFO': 'max_DSFO'}, inplace=True)
max_DSFO_pc.head()
| | user_id | max_DSFO |
|---|---|---|
| 0 | 1 | 176 |
| 1 | 2 | 198 |
| 2 | 5 | 40 |
| 3 | 7 | 203 |
| 4 | 8 | 60 |
Number of Orders per Customer
orders_pc = prior_orders_only.groupby('user_id').order_number.max().reset_index()
orders_pc['user_id'] = orders_pc.user_id.astype(np.uint32)
orders_pc.rename(columns={'order_number': 'number_of_orders'}, inplace=True)
orders_pc.head()
| | user_id | number_of_orders |
|---|---|---|
| 0 | 1 | 10 |
| 1 | 2 | 14 |
| 2 | 5 | 4 |
| 3 | 7 | 20 |
| 4 | 8 | 3 |
Final Summary for Products Ordered per Customer - Training Data
# days since first order per product per order per customer
props_pppc = flat_order_prods[['order_id','product_id','aisle_id','department_id','reordered']].merge(DSFO_popc, on="order_id")
# Aggregate to get properties for each product ordered for each customer.
# aisle_id and department_id are constant per product, so max simply selects them.
props_pppc = props_pppc.groupby(['user_id','product_id']).agg({'DSFO': [min, max],
                                                               'reordered': sum,
                                                               'aisle_id': max,
                                                               'department_id': max})
# Flatten the hierarchical column index, e.g. ('DSFO', 'min') -> 'DSFO_min'.
props_pppc = props_pppc.reset_index()
props_pppc.columns = ['_'.join(col).strip('_') for col in props_pppc.columns.values]
# add max_DSFO and total orders per customer
props_pppc = props_pppc.merge(max_DSFO_pc, on='user_id')
props_pppc = props_pppc.merge(orders_pc, on='user_id')
# change data types for space efficiency
props_pppc['user_id'] = props_pppc.user_id.astype(np.uint32)
props_pppc['product_id'] = props_pppc.product_id.astype(np.uint32)
# rename columns to more proper name following aggregation
props_pppc.rename(columns={'aisle_id_max': 'aisle_id'}, inplace=True)
props_pppc.rename(columns={'department_id_max': 'department_id'}, inplace=True)
# add product probabilities per customer
props_pppc = props_pppc.merge(product_probs_per_cust, on=['user_id','product_id'])
props_pppc.rename(columns={'prob': 'prod_prob'}, inplace=True)
# add aisle probabilities per customer
props_pppc = props_pppc.merge(aisle_probs_per_cust, on=['user_id','aisle_id'])
props_pppc.rename(columns={'prob': 'aisle_prob'}, inplace=True)
# add department probabilities per customer
props_pppc = props_pppc.merge(dept_probs_per_cust, on=['user_id','department_id'])
props_pppc.rename(columns={'prob': 'department_prob'}, inplace=True)
# add days since last order for the customer's final order
props_pppc = props_pppc.merge(final_orders_only[['user_id','days_since_prior_order']], on="user_id")
props_pppc.rename(columns={'days_since_prior_order': 'last_order_DSLO'}, inplace=True)
# compute reorder and recency probability along with the mean days to order each product.
props_pppc['reorder_prob'] = (props_pppc['reordered_sum'] + 1) / props_pppc['number_of_orders']
# Guard the division: when the final order was placed on the same day as the
# last prior order (last_order_DSLO == 0) set recency_prob to 0; this also
# avoids a 0/0 NaN for customers whose whole history, including the final
# order, falls on the day of their first order.
props_pppc['recency_prob'] = (props_pppc['DSFO_max'] / (props_pppc['max_DSFO'] + props_pppc['last_order_DSLO'])).where(props_pppc['last_order_DSLO'] > 0, 0)
# mean_DTOP - mean days to (re)order the product: the span between first and
# last purchase divided by the number of reorders; for products bought only
# once (DSFO_max == DSFO_min) fall back to DSFO_max.
props_pppc['mean_DTOP'] = ((props_pppc['DSFO_max'] - props_pppc['DSFO_min']) / props_pppc['reordered_sum']).where(props_pppc['DSFO_max'] > props_pppc['DSFO_min'], props_pppc['DSFO_max'])
props_pppc['mean_DTOP'] = props_pppc.mean_DTOP.astype(np.float16)
# compute DTOP probability
props_pppc['DTOP_prob'] = ((props_pppc['max_DSFO'] + props_pppc['last_order_DSLO']) - props_pppc['DSFO_max']) / props_pppc['mean_DTOP']
# mean_DTOP is 0 for products bought only on the day of the first order, which
# yields inf (bounded by the clip below) or a 0/0 NaN; replace NaNs with zero.
props_pppc['DTOP_prob'].fillna(0, inplace=True)
# Clip to the [0, 1) range since it is treated as a probability (the upper
# bound sits just below 1).
props_pppc['DTOP_prob'] = props_pppc['DTOP_prob'].clip(lower=0, upper=0.9999999)
# change all float64 fields to float16
props_pppc['prod_prob'] = props_pppc.prod_prob.astype(np.float16)
props_pppc['aisle_prob'] = props_pppc.aisle_prob.astype(np.float16)
props_pppc['department_prob'] = props_pppc.department_prob.astype(np.float16)
props_pppc['reorder_prob'] = props_pppc.reorder_prob.astype(np.float16)
props_pppc['recency_prob'] = props_pppc.recency_prob.astype(np.float16)
props_pppc['DTOP_prob'] = props_pppc.DTOP_prob.astype(np.float16)
# Add the binary target column, initialised to zero; it is set to 1 below for
# products reordered in the customer's final order.
props_pppc['reordered'] = 0
props_pppc['reordered'] = props_pppc.reordered.astype(np.uint8)
# drop the columns we no longer need
del props_pppc['DSFO_min']
del props_pppc['DSFO_max']
del props_pppc['reordered_sum']
del props_pppc['aisle_id']
del props_pppc['department_id']
del props_pppc['max_DSFO']
del props_pppc['number_of_orders']
del props_pppc['last_order_DSLO']
del props_pppc['mean_DTOP']
props_pppc.head()
| | user_id | product_id | prod_prob | aisle_prob | department_prob | reorder_prob | recency_prob | DTOP_prob | reordered |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 196 | 0.169434 | 0.22644 | 0.250000 | 1.000000 | 0.926270 | 0.715820 | 0 |
| 1 | 1 | 46149 | 0.050842 | 0.22644 | 0.250000 | 0.300049 | 0.926270 | 0.933105 | 0 |
| 2 | 1 | 10258 | 0.152588 | 0.16980 | 0.395752 | 0.899902 | 0.926270 | 0.695801 | 0 |
| 3 | 1 | 12427 | 0.169434 | 0.20752 | 0.395752 | 1.000000 | 0.926270 | 0.715820 | 0 |
| 4 | 1 | 26088 | 0.033905 | 0.20752 | 0.395752 | 0.199951 | 0.078918 | 1.000000 | 0 |
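As a sanity check on the formulas, the first row above (user 1, product 196) can be reproduced by hand; the inputs below are read off or inferred from the intermediate tables earlier in the notebook:
# User 1 / product 196: 10 prior orders, reordered in 9 of them, first bought
# on day 0 and last on day 176, with the final order placed 14 days later
# (last_order_DSLO inferred from recency_prob = 176 / (176 + 14)).
reordered_sum, number_of_orders = 9, 10
DSFO_min, DSFO_max = 0, 176
max_DSFO, last_order_DSLO = 176, 14

reorder_prob = (reordered_sum + 1) / number_of_orders               # 1.0
recency_prob = DSFO_max / (max_DSFO + last_order_DSLO)              # ~0.9263
mean_DTOP = (DSFO_max - DSFO_min) / reordered_sum                   # ~19.56 days
DTOP_prob = ((max_DSFO + last_order_DSLO) - DSFO_max) / mean_DTOP   # ~0.7159
These match the head() output above up to float16 rounding.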
Load Last Order Products to Mark Reordered Products per Customer
last_reordered_prods = load_orders_prods('data/split/sf_val_set_last_order_products.csv')
last_reordered_prods = last_reordered_prods[last_reordered_prods.reordered == 1][['order_id','product_id','reordered']]
last_reordered_prods = final_orders_only[['order_id','user_id']].merge(last_reordered_prods, on="order_id")
last_reordered_prods.head()
| | order_id | user_id | product_id | reordered |
|---|---|---|---|---|
| 0 | 1187899 | 1 | 196 | 1 |
| 1 | 1187899 | 1 | 25133 | 1 |
| 2 | 1187899 | 1 | 38928 | 1 |
| 3 | 1187899 | 1 | 26405 | 1 |
| 4 | 1187899 | 1 | 39657 | 1 |
# Mark as reordered every (user_id, product_id) pair that also appears among
# the reordered products of the customer's final order.
keys = ['user_id','product_id']
i1 = props_pppc.set_index(keys).index
i2 = last_reordered_prods.set_index(keys).index
props_pppc.loc[i1.isin(i2), 'reordered'] = 1
props_pppc.head()
| | user_id | product_id | prod_prob | aisle_prob | department_prob | reorder_prob | recency_prob | DTOP_prob | reordered |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 196 | 0.169434 | 0.22644 | 0.250000 | 1.000000 | 0.926270 | 0.715820 | 1 |
| 1 | 1 | 46149 | 0.050842 | 0.22644 | 0.250000 | 0.300049 | 0.926270 | 0.933105 | 1 |
| 2 | 1 | 10258 | 0.152588 | 0.16980 | 0.395752 | 0.899902 | 0.926270 | 0.695801 | 1 |
| 3 | 1 | 12427 | 0.169434 | 0.20752 | 0.395752 | 1.000000 | 0.926270 | 0.715820 | 0 |
| 4 | 1 | 26088 | 0.033905 | 0.20752 | 0.395752 | 0.199951 | 0.078918 | 1.000000 | 1 |
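The multi-key membership test above can also be written as a left merge with an indicator column; a minimal alternative sketch, left commented out so it does not re-run, assuming last_reordered_prods has no duplicate user/product pairs:
# flags = props_pppc[keys].merge(last_reordered_prods[keys].drop_duplicates(),
#                                on=keys, how='left', indicator=True)
# props_pppc['reordered'] = (flags['_merge'] == 'both').astype(np.uint8).values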
props_pppc.to_csv('validation.csv')