import pandas as pd
fp = "../data/Retail_Transactions_Dataset.csv"
df = pd.read_csv(fp)Data Preparation Notebook
The data for this notebook is from Kaggle. The data looks like it is synthetic, the characteristics observed in the dataset also seem to suggest that. The objectives for this notebook are to:
- Understand the attributes in the dataset, thier purpose and range of values
 - Understand how transactions are represented
 - Extract transactions involving an ice cream purchase
 - Aggregate the ice cream transactions at a daily cadence
 - Extract the maximum daily sales of ice cream for each week of the analysis period
 
df.head()Profile the categorical columns
df.columnscategory_cols = ['Payment_Method', 'City', 'Store_Type',\
       'Discount_Applied', 'Customer_Category', 'Season', 'Promotion']df[category_cols] = df[category_cols].astype('category')df["Date"] = pd.to_datetime(df["Date"])df.dtypesdf["Customer_Category"].value_counts()df["City"].value_counts()Observation
The counts for each of the metro areas are very similar, the counts for each of the customer categories are very similar, so this dataset was probably synthetically generated.
df["Product"] = df["Product"].apply(eval)purchase_summ = {}
for index, row in df["Product"].items():
    for p in row:
        if p in purchase_summ:
            purchase_summ[p] += 1
        else:
            purchase_summ[p] = 1
        Extract the ice cream transactions
def is_ice_cream(row):
    for p in row:
        if p == "Ice Cream":
            return True
    return False
df["is_ice_cream"] = df["Product"].apply(is_ice_cream)df_ice_cream_trans = df[df["is_ice_cream"]].reset_index()
req_cols = ["Date"]
df_ice_cream_trans = df_ice_cream_trans[req_cols]df_ice_cream_trans["ice_cream_purchases"] = 1df_daily_ice_cream_sales = df_ice_cream_trans.set_index("Date").resample("D").sum()df_weekly_max_ice_cream_sales = df_daily_ice_cream_sales.assign(year=df_daily_ice_cream_sales.index.year,\
                                week = df_daily_ice_cream_sales.index.isocalendar().week).groupby(['year','week']).max()df_weekly_mean_ice_cream_sales = df_daily_ice_cream_sales.assign(year=df_daily_ice_cream_sales.index.year,\
                                week = df_daily_ice_cream_sales.index.isocalendar().week).groupby(['year','week']).mean()df_weekly_max_ice_cream_sales = pd.DataFrame(df_weekly_max_ice_cream_sales.to_records()) df_weekly_max_ice_cream_salesdf_weekly_mean_ice_cream_salesWrite the extracted files for further analysis
fpdaily = "../data/daily_ice_cream_sales.csv"
fpmaxweekly = "../data/max_weekly_ice_cream_sales.csv"
fpmeanweekly = "../data/mean_weekly_ice_cream_sales.csv"
df_daily_ice_cream_sales.to_csv(fpdaily, index=True)
df_weekly_max_ice_cream_sales.to_csv(fpmaxweekly, index=True)
df_weekly_mean_ice_cream_sales.to_csv(fpmeanweekly, index=True)