import kagglehub
from kagglehub import KaggleDatasetAdapter
import kagglehub
# Download the latest version of the French bakery daily-sales dataset.
# `path` holds the local location of the downloaded dataset files.
path = kagglehub.dataset_download("matthieugimbert/french-bakery-daily-sales")

print("Path to dataset files:", path)
Dependency Check
import pandas as pd

# kagglehub, please make this simpler, provide some example, really poor documentation for the api
# Read the sales CSV that sits inside the downloaded dataset location.
fp = path + "/Bakery Sales.csv"
df = pd.read_csv(fp)

# Keep every column except "Unnamed: 0"
# (presumably a stray index column written with the CSV - confirm).
cols = df.columns.tolist()
cols.remove("Unnamed: 0")
df = df[cols]
df.head()
# Boolean mask selecting the rows whose article is "BAGUETTE".
filter_baguette = (df.article == "BAGUETTE")

# Merge the separate date and time text columns into a single timestamp column.
df["datetime"] = pd.to_datetime(df["date"] + " " + df["time"])

# Restrict to baguette rows and to the two columns needed for the time series.
df_baguette = df[filter_baguette]
cols = ["datetime", "Quantity"]
df_baguette = df_baguette[cols]

# Sum quantities per calendar day ("D") and per week ("W").
df_daily_baugette = df_baguette.set_index("datetime").resample("D").sum()
df_weekly_baugette = df_baguette.set_index("datetime").resample("W").sum()

# Turn the weekly timestamp index back into a regular column.
df_weekly_baugette = df_weekly_baugette.reset_index()
df_weekly_baugette
# Shape of the subset of days on which no baguettes were sold.
zero_counts = (df_daily_baugette.Quantity == 0)
df_daily_baugette[zero_counts].shape

# Move the timestamp index into a "datetime" column so calendar features
# can be derived from it.
df_daily_baugette = df_daily_baugette.reset_index()

# "Day" is the 0-based row position after the reset above.
df_daily_baugette["Day"] = df_daily_baugette.index
df_daily_baugette["DOW"] = df_daily_baugette.datetime.dt.day_of_week
df_daily_baugette["month"] = df_daily_baugette.datetime.dt.month
# "WOY" is the ISO week number of each date.
df_daily_baugette["WOY"] = df_daily_baugette.datetime.dt.isocalendar().week

# Fix the column order.
cols = ["Day", "datetime", "DOW", "month", "WOY", "Quantity"]
df_daily_baugette = df_daily_baugette[cols]
df_daily_baugette.shape
from matplotlib import pyplot as plt

# Line plot of the daily baguette quantity on a 12 x 6 inch figure.
plt.figure(figsize=(12, 6))
df_daily_baugette["Quantity"].plot()
The purpose of the STL decomposition is to identify the underlying components of the series. The purpose of the autocorrelation plots is to get another perspective on the dependence structure. Together, these tell us which variance components we would have to account for if we were going to build a forecasting model. Note that for demand estimation we are not actually forecasting anything: we need a statistical characterization of the demand that is independent of the order in which demand arrives. We only care that the net demand is right; we don't care how it arrives. In forecasting, the order does matter, and we have to account for it in the model.
from statsmodels.tsa.seasonal import STL

# Daily quantity as a Series indexed by date, which is the input STL is given here.
daily_baugette_sales = pd.Series(df_daily_baugette["Quantity"].values, index=df_daily_baugette["datetime"])

# period=7 on daily data: the seasonal component is a weekly cycle.
stl = STL(daily_baugette_sales, period=7)
res = stl.fit()

# Read the components through the public result properties rather than the
# private `_trend` / `_seasonal` / `_resid` attributes.
decomp_res = {"Trend": res.trend, "Seasonality": res.seasonal, "Noise": res.resid}
df_res = pd.DataFrame.from_dict(decomp_res, orient="columns")

# Expose the date index as a "datetime" column and add a 1-based day counter.
df_res = df_res.reset_index()
df_res["Day"] = df_res.index + 1
df_res
Zero Check
This checks whether the series is a count process dominated by zeros, or whether zeros are only a small fraction of the observations.
# Mask of the days with zero baguettes sold; the first element of the shape
# below is the number of such days.
zero_counts = (df_daily_baugette.Quantity == 0)
df_daily_baugette[zero_counts].shape
# Using plotly.express
import plotly.express as px

# Interactive line chart of the daily baguette quantity over time.
fig = px.line(df_daily_baugette, x='datetime', y="Quantity")

# Fixed-size canvas (autosize off) so the chart renders at 1100 x 800.
fig.update_layout(
    autosize=False,
    width=1100,
    height=800,
)
fig.show()
# Using plotly.express
import plotly.express as px

# Interactive line chart of the STL trend component; `labels` renames the axes.
fig = px.line(
    df_res,
    x='datetime',
    y="Trend",
    title="Trend Cycle Component of Daily Baugette Sales",
    labels={"Trend": "Number of Baugettes", "datetime": "date"},
)

# Fixed-size canvas (autosize off) so the chart renders at 1100 x 800.
fig.update_layout(
    autosize=False,
    width=1100,
    height=800,
)
fig.show()
# Using plotly.express
import plotly.express as px

# Interactive line chart of the STL seasonal component.
# The `labels` mapping is keyed by the plotted column name, so the y-axis
# entry must be "Seasonality"; the previous "Trend" key matched no plotted
# column and left the y-axis label unchanged.
fig = px.line(
    df_res,
    x='datetime',
    y="Seasonality",
    title="Seasonality Component of Daily Baugette Sales",
    labels={"Seasonality": "Number of Baugettes", "datetime": "date"},
)

# Fixed-size canvas (autosize off) so the chart renders at 1100 x 800.
fig.update_layout(
    autosize=False,
    width=1100,
    height=800,
)
fig.show()
# Using plotly.express
import plotly.express as px

# Interactive line chart of the STL residual ("Noise") component.
fig = px.line(
    df_res,
    x='datetime',
    y="Noise",
    title="Noise Component of Daily Baugette Sales",
    labels={"Noise": "Number of Baugettes", "datetime": "date"},
)

# Fixed-size canvas (autosize off) so the chart renders at 1100 x 800.
fig.update_layout(
    autosize=False,
    width=1100,
    height=800,
)
fig.show()

# Kernel density estimate of the trend component values.
df_res["Trend"].plot.kde()
plt.grid(True)

# Kernel density estimate of the seasonal component values.
df_res["Seasonality"].plot.kde()
plt.grid(True)
import statsmodels.api as sm

# Autocorrelation of the daily quantity for lags 0..40.
plt.rc("figure", figsize=(12, 8))
acf_plot = sm.graphics.tsa.plot_acf(df_daily_baugette["Quantity"], lags=40)
plt.grid(True)

# Partial autocorrelation for the same lags, estimated with the "ywm" method.
plt.rc("figure", figsize=(12, 8))
pacf_plot = sm.graphics.tsa.plot_pacf(df_daily_baugette["Quantity"], lags=40, method="ywm")
plt.grid(True)
# Per-transaction baguette frame: renumber the rows and add calendar features.
# reset_index(drop=True) discards the old index, so no "index" column is
# created; the former rename of "index" to "Day" was a no-op and is omitted.
df_baguette = df_baguette.reset_index(drop=True)
df_baguette["Day"] = df_baguette.index + 1
df_baguette["DOW"] = df_baguette.datetime.dt.day_of_week
df_baguette["month"] = df_baguette.datetime.dt.month

# Continuous week index across the two years: weeks that belong to 2022 are
# shifted by 53 so that they sort after the 2021 weeks.
# "WOY" is an ISO week number, so the year test must use the ISO year too.
# With the calendar year, 1-2 January 2022 (ISO week 52 of ISO year 2021)
# were mapped to 52 + 53 = 105 instead of staying in week 52.
# NOTE(review): dates in the first days of January 2021 fall in ISO week 53
# of ISO year 2020 and therefore still get CWOY 53 - confirm this is acceptable.
iso_year = df_daily_baugette.datetime.dt.isocalendar().year.astype("int64")
week_of_year = df_daily_baugette["WOY"].astype("int64")
df_daily_baugette["CWOY"] = week_of_year + 53 * (iso_year == 2022).astype("int64")
df_daily_baugette["CWOY"].max()
df_daily_baugette

# Persist the daily series with its calendar features (row index not written).
fp = "../data/daily_baugette_sales.csv"
df_daily_baugette.to_csv(fp, index=False)
# One row per continuous week (CWOY), one column per day of week (DOW);
# week/day combinations with no data are filled with 0.
df_period_perf = pd.pivot_table(
    df_daily_baugette,
    index="CWOY",
    columns="DOW",
    values="Quantity",
    fill_value=0,
)

# Scale each daily total by the number of opening hours (1 leaves it unchanged).
HOURS_OPEN = 1
df_period_perf = df_period_perf.div(HOURS_OPEN).round(3).reset_index()

# Back to long format: one row per (CWOY, day-of-week) pair.
df_period_perf = df_period_perf.melt(
    id_vars=["CWOY"],
    value_vars=list(range(7)),
    value_name="Quantity",
)
df_period_perf
# Kernel density estimate of the per-period quantities.
df_period_perf.Quantity.plot.kde()

# Using plotly.express
import plotly.express as px

# Scatter of per-period quantity against the continuous week index.
fig = px.scatter(df_period_perf, x='CWOY', y="Quantity")

# Fixed-size canvas (autosize off) so the chart renders at 1100 x 800.
fig.update_layout(
    autosize=False,
    width=1100,
    height=800,
)
fig.show()