import ast

import numpy as np
import pandas as pd
# Step 1: Load the transactions CSV into a pandas DataFrame.
# Point `fp` at a different CSV to run the same steps on another data source.
fp = "../data/Retail_Transactions_Dataset.csv"
df = pd.read_csv(fp)

# Step 2: Names of the columns to be stored with the 'category' dtype.
categorical_columns = [
    'Payment_Method',
    'City',
    'Store_Type',
    'Discount_Applied',
    'Customer_Category',
    'Season',
    'Promotion',
]

# Step 3: Name of the column that holds the timestamp.
timestamp_column = 'Date'

# Step 4: Convert each listed column to the 'category' dtype.
for col in categorical_columns:
    df[col] = df[col].astype('category')

# Step 5: Parse the timestamp column into datetime values.
df[timestamp_column] = pd.to_datetime(df[timestamp_column])
# Step 6: Function to check for a sentinel value (0 unless the caller passes another one)
def contains_sentinel_value(string_list, sentinel=0):
    """Return True if the literal encoded in ``string_list`` contains ``sentinel``.

    Args:
        string_list: String representation of a Python list, e.g. ``"[0, 1, 2]"``.
        sentinel: Value to look for in the parsed list. Defaults to 0.

    Returns:
        True when the parsed value contains ``sentinel``. False when it does
        not, or when ``string_list`` cannot be parsed or searched (best-effort:
        malformed cells are treated as "no match" rather than raising).
    """
    try:
        # literal_eval only accepts Python literals, so cell contents read from
        # a data file are parsed without ever being executed (unlike eval()).
        actual_list = ast.literal_eval(string_list)
        return sentinel in actual_list
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # ValueError/SyntaxError/MemoryError/RecursionError: not a parsable literal.
        # TypeError: the parsed value does not support the membership test.
        return False
# Step 7: Flag the rows whose "Product" entry contains the sentinel value
df['is_ice_cream'] = df["Product"].apply(contains_sentinel_value)

# Step 8: Keep only the flagged rows and only the timestamp column.
# The mask must use the 'is_ice_cream' column created above, and the explicit
# copy makes filtered_df independent of df so the assignment in Step 9 does
# not write to a slice of df (SettingWithCopyWarning).
filtered_df = df.loc[df['is_ice_cream'], [timestamp_column]].copy()

# Step 9: Define a new column in the filtered DataFrame that is set to the value 1,
# so that summing it counts the matching rows
filtered_df['is_ice_cream'] = 1

# Step 10: Set the index of the filtered DataFrame to the timestamp column
filtered_df = filtered_df.set_index(timestamp_column)

# Step 11: Resample on the timestamp index and sum the new column.
# 'min' is the one-minute frequency alias (the older 'T' spelling is
# deprecated); change the frequency as needed.
resampled_df = filtered_df.resample('min').sum()

# Display the final resampled DataFrame
print(resampled_df)