import pandas as pd
= "../data/Retail_Transactions_Dataset.csv"
fp = pd.read_csv(fp) df
Data Preparation Notebook
The data for this notebook is from Kaggle. The data looks like it is synthetic, the characteristics observed in the dataset also seem to suggest that. The objectives for this notebook are to:
- Understand the attributes in the dataset, thier purpose and range of values
- Understand how transactions are represented
- Extract transactions involving an ice cream purchase
- Aggregate the ice cream transactions at a daily cadence
- Extract the maximum daily sales of ice cream for each week of the analysis period
df.head()
Profile the categorical columns
df.columns
= ['Payment_Method', 'City', 'Store_Type',\
category_cols 'Discount_Applied', 'Customer_Category', 'Season', 'Promotion']
= df[category_cols].astype('category') df[category_cols]
"Date"] = pd.to_datetime(df["Date"]) df[
df.dtypes
"Customer_Category"].value_counts() df[
"City"].value_counts() df[
Observation
The counts for each of the metro areas are very similar, the counts for each of the customer categories are very similar, so this dataset was probably synthetically generated.
"Product"] = df["Product"].apply(eval) df[
= {}
purchase_summ for index, row in df["Product"].items():
for p in row:
if p in purchase_summ:
+= 1
purchase_summ[p] else:
= 1
purchase_summ[p]
Extract the ice cream transactions
def is_ice_cream(row):
for p in row:
if p == "Ice Cream":
return True
return False
"is_ice_cream"] = df["Product"].apply(is_ice_cream) df[
= df[df["is_ice_cream"]].reset_index()
df_ice_cream_trans = ["Date"]
req_cols = df_ice_cream_trans[req_cols] df_ice_cream_trans
"ice_cream_purchases"] = 1 df_ice_cream_trans[
= df_ice_cream_trans.set_index("Date").resample("D").sum() df_daily_ice_cream_sales
= df_daily_ice_cream_sales.assign(year=df_daily_ice_cream_sales.index.year,\
df_weekly_max_ice_cream_sales = df_daily_ice_cream_sales.index.isocalendar().week).groupby(['year','week']).max() week
= df_daily_ice_cream_sales.assign(year=df_daily_ice_cream_sales.index.year,\
df_weekly_mean_ice_cream_sales = df_daily_ice_cream_sales.index.isocalendar().week).groupby(['year','week']).mean() week
= pd.DataFrame(df_weekly_max_ice_cream_sales.to_records()) df_weekly_max_ice_cream_sales
df_weekly_max_ice_cream_sales
df_weekly_mean_ice_cream_sales
Write the extracted files for further analysis
= "../data/daily_ice_cream_sales.csv"
fpdaily = "../data/max_weekly_ice_cream_sales.csv"
fpmaxweekly = "../data/mean_weekly_ice_cream_sales.csv"
fpmeanweekly =True)
df_daily_ice_cream_sales.to_csv(fpdaily, index=True)
df_weekly_max_ice_cream_sales.to_csv(fpmaxweekly, index=True) df_weekly_mean_ice_cream_sales.to_csv(fpmeanweekly, index