Dependency Check

import kagglehub
from kagglehub import KaggleDatasetAdapter
import kagglehub

# Fetch the latest published version of the dataset through kagglehub and
# report the local location it hands back.
path = kagglehub.dataset_download(
    "matthieugimbert/french-bakery-daily-sales"
)

print(f"Path to dataset files: {path}")

import os

import pandas as pd

# kagglehub only hands back a location for the dataset; the CSV file name
# has to be appended by hand. os.path.join avoids hand-built separators.
fp = os.path.join(path, "Bakery Sales.csv")
df = pd.read_csv(fp)

# Discard the unnamed leading column (presumably a serialized row index --
# confirm against the CSV). errors="ignore" keeps this cell working when the
# column is absent instead of raising.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
cols = df.columns.tolist()
df.head()

# Merge the separate date and time text columns into a single timestamp
# column on the full sales table.
df["datetime"] = pd.to_datetime(df["date"] + " " + df["time"])

# Keep only the rows whose article is exactly "BAGUETTE", and of those only
# the timestamp and the quantity.
filter_baguette = df["article"] == "BAGUETTE"
cols = ["datetime", "Quantity"]
df_baguette = df.loc[filter_baguette, cols]

# Index the baguette records by timestamp once, then total the quantities
# per calendar day ("D") and per week ("W").
baguette_by_time = df_baguette.set_index("datetime")
df_daily_baugette = baguette_by_time.resample("D").sum()

# The weekly totals get their timestamp back as an ordinary column.
df_weekly_baugette = baguette_by_time.resample("W").sum().reset_index()

df_weekly_baugette

# Flag the days whose total quantity is exactly zero; the first element of
# the resulting shape is the number of such days.
zero_counts = df_daily_baugette["Quantity"].eq(0)
df_daily_baugette.loc[zero_counts].shape

# Turn the timestamp index back into a column and derive calendar features
# from it:
#   Day   - row position after the reset (0-based in this table)
#   DOW   - day of week (pandas convention: Monday=0 ... Sunday=6)
#   month - calendar month number
#   WOY   - ISO week number
df_daily_baugette = df_daily_baugette.reset_index().assign(
    Day=lambda frame: frame.index,
    DOW=lambda frame: frame["datetime"].dt.day_of_week,
    month=lambda frame: frame["datetime"].dt.month,
    WOY=lambda frame: frame["datetime"].dt.isocalendar().week,
)

# Fix the column order used from here on.
cols = ["Day", "datetime", "DOW", "month", "WOY", "Quantity"]
df_daily_baugette = df_daily_baugette.loc[:, cols]

df_daily_baugette.shape

from matplotlib import pyplot as plt

# Static line plot of the daily quantities on a fresh 12x6 inch figure.
plt.figure(figsize=(12, 6))
df_daily_baugette.loc[:, "Quantity"].plot(kind="line")

The purpose of the STL decomposition is to identify the underlying components of the series. The purpose of the autocorrelation plots is to get another perspective on the dependence structure. Together they tell us which variance components we would have to account for if we were going to build a forecasting model. Note that for demand estimation we are not actually forecasting anything: we need a statistical characterization of the demand that is independent of the order in which demand arrives. We only care that the net demand is right, not how it arrives. In forecasting the order does matter, and the model has to account for it.

from statsmodels.tsa.seasonal import STL

# Daily quantities as a Series indexed by date, decomposed by STL with a
# 7-observation seasonal period (one week of daily data).
daily_baugette_sales = pd.Series(
    df_daily_baugette["Quantity"].values,
    index=df_daily_baugette["datetime"],
)
stl = STL(daily_baugette_sales, period=7)
res = stl.fit()

# Read the components through the public result properties (trend /
# seasonal / resid) instead of the underscore-prefixed private attributes,
# which are implementation details of the result object.
decomp_res = {
    "Trend": res.trend,
    "Seasonality": res.seasonal,
    "Noise": res.resid,
}
df_res = pd.DataFrame.from_dict(decomp_res, orient="columns")

# Expose the date index as a regular column for the plots that follow.
df_res = df_res.reset_index()

# 1-based running row number.
df_res["Day"] = df_res.index + 1

df_res

Zero Check

This checks whether the daily sales series is a count process dominated by zeros, or whether zero-sales days are few in number.

# Boolean mask of the days with a total quantity of zero; the row count of
# the selection is how many such days there are.
zero_counts = df_daily_baugette["Quantity"] == 0
df_daily_baugette.loc[zero_counts, :].shape

import plotly.express as px

# Interactive line chart of the daily quantities, pinned to 1100x800 with
# autosizing switched off.
fig = px.line(data_frame=df_daily_baugette, x="datetime", y="Quantity")
fig.update_layout(autosize=False, width=1100, height=800)
fig.show()

import plotly.express as px

# Interactive line chart of the STL trend component over time, with
# friendlier axis labels and a fixed 1100x800 canvas.
fig = px.line(
    df_res,
    x="datetime",
    y="Trend",
    title="Trend Cycle Component of Daily Baugette Sales",
    labels={"Trend": "Number of Baugettes", "datetime": "date"},
)
fig.update_layout(autosize=False, width=1100, height=800)
fig.show()

import plotly.express as px

# Interactive line chart of the STL seasonal component over time.
# The labels mapping must be keyed by the plotted column ("Seasonality");
# keyed by "Trend" it matched nothing and the y-axis label was never applied.
fig = px.line(
    df_res,
    x="datetime",
    y="Seasonality",
    title="Seasonality Component of Daily Baugette Sales",
    labels={"Seasonality": "Number of Baugettes", "datetime": "date"},
)
fig.update_layout(
    autosize=False,
    width=1100,
    height=800,
)
fig.show()

import plotly.express as px

# Interactive line chart of the STL residual ("Noise") component over time,
# drawn at a fixed 1100x800 size.
fig = px.line(
    data_frame=df_res,
    x="datetime",
    y="Noise",
    title="Noise Component of Daily Baugette Sales",
    labels={"Noise": "Number of Baugettes", "datetime": "date"},
)
fig.update_layout(autosize=False, width=1100, height=800)
fig.show()

# Kernel density estimates of the trend and seasonal components.
# Each estimate gets its own explicitly created figure: without it the plot
# lands on whatever axes happen to be current, so outside per-cell notebook
# execution the two densities would be drawn over each other (and over any
# earlier plot that is still open).
plt.figure()
df_res["Trend"].plot.kde()
plt.grid(True)

plt.figure()
df_res["Seasonality"].plot.kde()
plt.grid(True)

import statsmodels.api as sm

# Autocorrelation of the daily quantities for lags up to 40, on a 12x8 inch
# figure with a grid.
plt.rcParams["figure.figsize"] = (12, 8)
acf_plot = sm.graphics.tsa.plot_acf(df_daily_baugette["Quantity"], lags=40)
plt.grid(True)

# Partial autocorrelation for the same lags, estimated with the "ywm"
# method, on the same figure size.
plt.rcParams["figure.figsize"] = (12, 8)
pacf_plot = sm.graphics.tsa.plot_pacf(
    df_daily_baugette["Quantity"],
    lags=40,
    method="ywm",
)
plt.grid(True)

# Renumber the filtered records 0..n-1, discarding the row labels inherited
# from the full sales table. Because the old index is dropped, no "index"
# column is created, so the former rename of "index" to "Day" had nothing to
# act on and has been removed.
df_baguette = df_baguette.reset_index(drop=True)

# NOTE(review): this table holds the filtered raw records, not daily totals,
# so "Day" here is a 1-based row counter rather than a day number -- confirm
# that this is what is intended.
df_baguette["Day"] = df_baguette.index + 1
# Day of week (pandas convention: Monday=0) and calendar month per record.
df_baguette["DOW"] = df_baguette.datetime.dt.day_of_week
df_baguette["month"] = df_baguette.datetime.dt.month

# Continuous week index ("CWOY") that keeps increasing across year
# boundaries.
#
# ISO week numbers belong to the ISO year, which differs from the calendar
# year for a few days around New Year (e.g. 2022-01-01 falls in ISO week 52
# of 2021). Choosing the offset from the calendar year therefore misplaced
# those days by a whole year of weeks. The offset is now derived from the
# ISO year that the week number actually refers to.
#
# A stride of 53 per year is kept from the original numbering: no ISO year
# has more than 53 weeks, so indices never collide, at the cost of a gap
# after 52-week years. Days that belong to an ISO year before the first
# calendar year in the data get an index of zero or below, which still
# sorts them first.
iso_calendar = df_daily_baugette["datetime"].dt.isocalendar()
base_year = df_daily_baugette["datetime"].dt.year.min()
# Cast to signed integers first: the ISO fields are unsigned, and the year
# difference can be negative.
df_daily_baugette["CWOY"] = (
    iso_calendar["week"].astype("int64")
    + 53 * (iso_calendar["year"].astype("int64") - base_year)
)

df_daily_baugette["CWOY"].max()

df_daily_baugette

# Persist the daily table as CSV, leaving the row index out of the file.
fp = "../data/daily_baugette_sales.csv"
df_daily_baugette.to_csv(path_or_buf=fp, index=False)

# Week-by-weekday table of daily quantities: one row per continuous week
# (CWOY), one column per day of week (0-6), aggregated with pivot_table's
# default (mean). Week/weekday pairs with no data are filled with 0.
# NOTE(review): those filled zeros flow into the density plot below as if
# they were observed zero-sales days -- confirm this is intended.
df_period_perf = pd.pivot_table(
    df_daily_baugette,
    index="CWOY",
    columns="DOW",
    values="Quantity",
    fill_value=0,
)

# Scale to a per-hour rate. Dividing the whole table at once replaces the
# former row-by-row apply, which computed the same values far more slowly.
HOURS_OPEN = 1
df_period_perf = df_period_perf.div(HOURS_OPEN).round(3).reset_index()

# Back to long form: one row per (week, weekday) pair.
df_period_perf = df_period_perf.melt(
    id_vars=["CWOY"],
    value_vars=list(range(7)),
    value_name="Quantity",
)

df_period_perf

# Draw the density on its own figure so it does not land on axes left over
# from an earlier plot.
plt.figure()
df_period_perf.Quantity.plot.kde()

import plotly.express as px

# Interactive scatter of quantity against the continuous week index, drawn
# at a fixed 1100x800 size with autosizing off.
fig = px.scatter(data_frame=df_period_perf, x="CWOY", y="Quantity")
fig.update_layout(autosize=False, width=1100, height=800)
fig.show()