Time Series with SARIMAX & Prophet | ||
---|---|---|
This notebook contains my notes from the tutorial source linked below. I have personalized the code in many places to match my own way of doing things, and I have added my own explanations of things for my own study. | ||
| Source Author | Source Repository | |
Evan Marie online: | EvanMarie.com | EvanMarie@Proton.me | LinkedIn | GitHub | Hugging Face | Mastodon | Jovian.ai | TikTok | CodeWars | Discord ⇨ ✨ EvanMarie ✨#6114 | |
from helpers_strawberries import *
import_all()
plt.style.use('strawberries.mplstyle')
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
%%html
<style>
a:link {color: #742a40 !important; font-weight: 600 !important;}
a:visited {color: #742a40 !important; font-weight: 600 !important;}
</style>
Time Series Components | ||
---|---|---|
- trend shows whether the series is consistently decreasing (downward trend), constant (no trend), or increasing (upward trend) over time
- seasonality describes the periodic signal in the time series
- noise or residual is the unexplained variance and volatility of the time series
||
- Python’s statsmodels library: seasonal_decompose | ||
- Seasonality: a pattern that occurs in a fixed and known period
- Cyclicality: a pattern that does not have a fixed or known period
- Stationarity: when the data's statistical properties do not change over time
  - with algorithms like SARIMAX, it is important to identify this property, because they depend on it
  - with linear regression, it is assumed that observations are independent of each other
  - in a time series, observations are time dependent
  - by making the time series stationary, it is possible to apply regression techniques to time-dependent variables
  - non-stationary time series can be made stationary
Stationary Time Series Criteria
- the variance in the seasonality component is constant
- the amplitude of the signal does not change much over time
- autocorrelation is constant - the relationship of each value in the time series and its neighbors stays the same
- analyzing components is a common way to check for stationarity
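Another quick check is to plot rolling statistics next to the series itself: if the rolling mean and rolling standard deviation stay roughly constant, that is consistent with stationarity. This is a minimal sketch of my own (the function name and window size are illustrative, not from the tutorial) and works for any pandas Series with a datetime index:
import pandas as pd
import matplotlib.pyplot as plt

def rolling_stationarity_plot(series: pd.Series, window: int = 12):
    # Plot the series together with its rolling mean and rolling standard deviation.
    # Roughly constant rolling statistics are consistent with a stationary series.
    ax = series.plot(label='observed', figsize=(12, 4))
    series.rolling(window).mean().plot(ax=ax, label=f'rolling mean ({window})')
    series.rolling(window).std().plot(ax=ax, label=f'rolling std ({window})')
    ax.legend();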
Additive Models and Multiplicative Models | ||
---|---|---|
Time series trend, seasonal, and residual components can occur in either an additive or multiplicative way. | ||
Additive Models:
- the components are added linearly
- changes over time are consistent in the amount they change
$$Y(t) = trend + seasonality + residual$$
- a linear trend is a straight line
- linear seasonality has the same frequency and amplitude (width and height of cycles)
||
Multiplicative Models:
- the components are multiplied with one another
$$Y(t) = trend \times seasonality \times residual$$
- non-linear, i.e. quadratic or exponential
- changes increase or decrease over time
- the trend is a curved, non-linear line
- non-linear seasonality varies in frequency and amplitude
Decomposition Models
- a main objective of decomposition is to estimate seasonal effects
- these can be used to create seasonally adjusted values
- additive models are useful when seasonal variation is fairly constant over time
- multiplicative models are useful when seasonal variation increases over time
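To make the additive vs. multiplicative distinction concrete, here is a small sketch of my own using a synthetic monthly series (the component definitions and period=12 are illustrative assumptions, not part of the tutorial data); seasonal_decompose recovers the components under each model:
import numpy as np
import pandas as pd
import statsmodels.api as sm

idx = pd.date_range('2015-01-01', periods=120, freq='MS')
trend = np.linspace(10, 20, 120)                              # slowly rising trend
seasonal = 1 + 0.2 * np.sin(2 * np.pi * np.arange(120) / 12)  # yearly cycle
noise = np.random.default_rng(0).normal(0, 0.02, 120)         # small residual

additive = pd.Series(trend + 10 * (seasonal - 1) + noise, index=idx)   # Y = T + S + R
multiplicative = pd.Series(trend * seasonal * (1 + noise), index=idx)  # Y = T * S * R

sm.tsa.seasonal_decompose(additive, model='additive', period=12).plot();
sm.tsa.seasonal_decompose(multiplicative, model='multiplicative', period=12).plot();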
Google Trends Data | ||
---|---|---|
- dataset reflecting the Google searches for the word 'diet' - date range is 2016-03-27 to 2021-03-21 |
diet = pd.read_csv("./sarimax_prophet_repo/data/raw/time-series/multiTimeline_diet.csv",
skiprows=[0,1],
index_col='Week',
parse_dates=['Week'])
df_overview(diet, 'Google "diet"')
diet: (United States) | |
---|---|
datatype | int64 |
missing values | 0 |
count | 261.00 |
mean | 56.35 |
std | 11.50 |
min | 30.00 |
25% | 49.00 |
50% | 55.00 |
75% | 64.00 |
max | 100.00 |
total rows | 261 |
---|---|
total columns | 1 |
column names | diet: (United States) |
index start | 2016-03-27 00:00:00 |
index end | 2021-03-21 00:00:00 |
total missing values | 0 |
Google "diet" Head and Tail |
diet: (United States) | |
---|---|
Week | |
2016-03-27 | 58 |
2016-04-03 | 63 |
2016-04-10 | 59 |
diet: (United States) | |
---|---|
Week | |
2021-03-07 | 48 |
2021-03-14 | 44 |
2021-03-21 | 48 |
Seasonal Pattern | ||
---|---|---|
- searches for 'diet' decrease rapidly at the end of each year - at the beginning of each year, they spike - seasonality occurs in a fixed and known period - no consistent increase or decrease in the trend, suggesting a non-linear trend |
diet.plot(title = 'Google Searches for the Word "diet"', color = 'cyan');
Breaking down components: | ||
---|---|---|
- because the trend is non-linear, the model parameter will be set to multiplicative (by default, it is additive) - the period can be specified depending on the time series - because the data is given in weeks, the period is set to the number of weeks in a year - the frequency and amplitude of seasonality remain constant, suggesting linearity |
statsmodels.tsa.seasonal_decompose() |
---|
# help(sm.tsa.seasonal_decompose)
import statsmodels.api as sm
decomposition = sm.tsa.seasonal_decompose(diet['diet: (United States)'],
model = 'multiplicative',
period=53)
fig = decomposition.plot();
fig.tight_layout(pad=0.75);
model = 'additive' |
---|
import statsmodels.api as sm
decomposition = sm.tsa.seasonal_decompose(diet['diet: (United States)'],
model = 'additive',
period=53)
fig = decomposition.plot();
fig.tight_layout(pad=0.75);
- trend is still non-linear - time series follows no consistent up or down slope - thus no positive or negative trend (up or down) - additive model fits the data better |
||
---|---|---|
- Additive & Multiplicative Decomposition | ||
Stationarity Tests | ||
---|---|---|
statsmodels documentation on ADF and KPSS tests
- the Augmented Dickey-Fuller (ADF) stationarity test can allow data to pass that may not actually be stationary
- it is best to also apply the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test to check for true stationarity
- it is also important to observe the time series' plot
||
ADF Test:
- tests for the presence of a unit root in the series
- helps determine if the series is stationary

In statistics, an augmented Dickey–Fuller test (ADF) tests the null hypothesis that a unit root is present in a time series sample. The alternative hypothesis is different depending on which version of the test is used, but is usually stationarity or trend-stationarity. It is an augmented version of the Dickey–Fuller test for a larger and more complicated set of time series models. The augmented Dickey–Fuller (ADF) statistic, used in the test, is a negative number. The more negative it is, the stronger the rejection of the hypothesis that there is a unit root at some level of confidence. (Source)

Unit Root

In probability theory and statistics, a unit root is a feature of some stochastic processes (such as random walks) that can cause problems in statistical inference involving time series models. A linear stochastic process has a unit root if 1 is a root of the process's characteristic equation. Such a process is non-stationary but does not always have a trend. (Source)
||
- Null Hypothesis: The series has a unit root, meaning it is non-stationary. It has some time dependent structure.
- Alternate Hypothesis: The series has no unit root, meaning it is stationary. It does not have time-dependent structure.
- if the null hypothesis is not rejected, that is possible evidence the series is non-stationary
- a p-value below a threshold (1%, 5%, etc.) suggests rejecting the null hypothesis, i.e. the series is stationary
- a p-value above the threshold suggests failing to reject the null hypothesis, i.e. the series is non-stationary
||
KPSS Test
- the null and alternate hypotheses are opposite those of ADF
- Null Hypothesis: the time series is trend stationary
- Alternate Hypothesis: the series has a unit root and is not stationary
- a p-value below a threshold suggests rejecting the null hypothesis, i.e. non-stationarity
- a p-value above the threshold suggests failing to reject the null hypothesis, i.e. stationarity
# statsmodels for the two tests |
---|
statsmodels.tsa.stattools.adfuller() - Augmented Dickey Fuller Test |
---|
# help(sm.tsa.stattools.adfuller)
from statsmodels.tsa.stattools import adfuller
def adf_test(timeseries):
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
statsmodels.tsa.stattools.kpss() - Kwiatkowski-Phillips-Schmidt-Shin test |
---|
# help(sm.tsa.stattools.kpss)
from statsmodels.tsa.stattools import kpss
def kpss_test(timeseries):
    print('Results of KPSS Test:')
    kpsstest = kpss(timeseries, regression='c', nlags="auto")
    kpss_output = pd.Series(kpsstest[0:3], index=['Test Statistic', 'p-value', 'Lags Used'])
    for key, value in kpsstest[3].items():
        kpss_output['Critical Value (%s)' % key] = value
    print(kpss_output)
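As a quick sanity check of these two helpers (a sketch of my own - the simulated series below are not part of the tutorial data), a random walk has a unit root and should fail the ADF test, while white noise should pass both:
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
white_noise = pd.Series(rng.normal(size=500))           # stationary by construction
random_walk = pd.Series(rng.normal(size=500).cumsum())  # contains a unit root, non-stationary

adf_test(white_noise)   # expect a small p-value: reject the unit-root null, i.e. stationary
kpss_test(white_noise)  # expect a large p-value: fail to reject the stationarity null
adf_test(random_walk)   # expect a large p-value: unit root present, i.e. non-stationary
kpss_test(random_walk)  # expect a small p-value: reject the stationarity null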
The following outcomes are possible: |
||
---|---|---|
- both tests deem the series not stationary, therefore the series is not stationary
- both tests deem the series IS stationary, therefore the series is stationary
- KPSS indicates stationarity and ADF indicates non-stationarity, meaning the series is trend stationary. In this case, the trend must be removed to make the series strictly stationary, and the series should be checked for stationarity again with the trend removed
- KPSS indicates non-stationarity and ADF indicates stationarity, meaning the series is difference stationary. In this case, differencing must be used to make the series stationary, and the differenced series should be checked for stationarity
ADF Results |
---|
adf_test(diet['diet: (United States)'])
Results of Dickey-Fuller Test:
Test Statistic                  -2.86
p-value                          0.05
#Lags Used                       6.00
Number of Observations Used    254.00
Critical Value (1%)             -3.46
Critical Value (5%)             -2.87
Critical Value (10%)            -2.57
dtype: float64
KPSS Results |
---|
kpss_test(diet['diet: (United States)'])
Results of KPSS Test:
Test Statistic           0.63
p-value                  0.02
Lags Used                9.00
Critical Value (10%)     0.35
Critical Value (5%)      0.46
Critical Value (2.5%)    0.57
Critical Value (1%)      0.74
dtype: float64
Results: | ||
---|---|---|
ADF -> with a 0.05 threshold, the p-value is just below the threshold, so the null hypothesis is rejected and the series is considered stationary. KPSS -> the evidence suggests rejecting the null hypothesis in favor of the alternate hypothesis, thus suggesting non-stationarity
- these results fall into the last category in the list above, so differencing must be applied to achieve stationarity
- the series must then be tested again
||
Summary:
- the trend is non-linear and multiplicative, neither increasing nor decreasing consistently
- seasonality is strongest at the end and beginning of each year
- seasonality is linear and does not vary in frequency or amplitude
- the additive residuals are lower than the multiplicative ones
- ADF says stationary while KPSS says non-stationary, so differencing must be applied
||
Conclusion:
- since seasonality is linear and the additive residuals are smaller, it is reasonable to choose the additive model as the more appropriate one
Making a Time Series Stationary | ||
---|---|---|
- statistical models often require a time series to be stationary in order to make effective and precise predictions, for example the ARIMA model

DIFFERENCING:
- subtract the previous value from each value in the time series
- other transformations are possible as well, e.g. taking the log or the square root of the time series
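A minimal sketch of these transformations using the diet series loaded above (the variable names are my own, not from the tutorial); note that a first difference can be undone with a cumulative sum plus the first observed value:
import numpy as np

sales = diet['diet: (United States)']

# First difference: subtract the previous value from each value
sales_diff = sales.diff().dropna()

# Other possible transformations (values must be positive for the log)
sales_log = np.log(sales)
sales_sqrt = np.sqrt(sales)

# Undoing the first difference: cumulatively sum and add back the first observation
restored = sales_diff.cumsum() + sales.iloc[0]
assert np.allclose(restored.values, sales.iloc[1:].values)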
def adf_kpss(timeseries, max_d):
    """Build a dataframe of ADF and KPSS p-values after repeatedly differencing a time series.
    Args:
        timeseries (df): Dataframe of a univariate time series
        max_d (int): Max number of times to apply differencing
    Returns:
        Dataframe showing ADF and KPSS p-values and stationarity flags after applying d rounds of
        differencing to the time series.
    """
    pd.reset_option('display.float_format')
    results = []
    for idx in range(max_d):
        adf_result = adfuller(timeseries, autolag='AIC')
        kpss_result = kpss(timeseries, regression='c', nlags="auto")
        timeseries = timeseries.diff().dropna()
        if adf_result[1] <= 0.05:
            adf_stationary = True
        else:
            adf_stationary = False
        if kpss_result[1] <= 0.05:
            kpss_stationary = False
        else:
            kpss_stationary = True
        stationary = adf_stationary & kpss_stationary
        results.append((idx, adf_result[1], kpss_result[1], adf_stationary, kpss_stationary, stationary))
    # Construct DataFrame -- note: the 'adf_stats' column holds the ADF p-value and 'p-value' holds the KPSS p-value
    results_df = pd.DataFrame(results, columns=['d', 'adf_stats', 'p-value', 'is_adf_stationary', 'is_kpss_stationary', 'is_stationary'])
    return results_df
adf_kpss(diet, 3)
/Users/evancarr/opt/anaconda3/envs/sktime_project_02_05_23/lib/python3.10/site-packages/statsmodels/tsa/stattools.py:2022: InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is greater than the p-value returned. warnings.warn( /Users/evancarr/opt/anaconda3/envs/sktime_project_02_05_23/lib/python3.10/site-packages/statsmodels/tsa/stattools.py:2022: InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is greater than the p-value returned. warnings.warn(
d | adf_stats | p-value | is_adf_stationary | is_kpss_stationary | is_stationary | |
---|---|---|---|---|---|---|
0 | 0 | 4.992622e-02 | 0.019878 | True | False | False |
1 | 1 | 1.171221e-13 | 0.100000 | True | True | True |
2 | 2 | 2.761322e-12 | 0.100000 | True | True | True |
pd.DataFrame.diff() - used to make the differenced and stationary time series |
---|
help(pd.DataFrame.diff)
Help on function diff in module pandas.core.frame:

diff(self, periods: 'int' = 1, axis: 'Axis' = 0) -> 'DataFrame'
    First discrete difference of element.

    Calculates the difference of a DataFrame element compared with another
    element in the DataFrame (default is element in previous row).

    Parameters
    ----------
    periods : int, default 1
        Periods to shift for calculating difference, accepts negative values.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Take difference over rows (0) or columns (1).

    Returns
    -------
    DataFrame
        First differences of the Series.

    See Also
    --------
    DataFrame.pct_change: Percent change over given number of periods.
    DataFrame.shift: Shift index by desired number of periods with an optional time freq.
    Series.diff: First discrete difference of object.

    Notes
    -----
    For boolean dtypes, this uses :meth:`operator.xor` rather than :meth:`operator.sub`.
    The result is calculated according to current dtype in DataFrame,
    however dtype of the result is always float64.

    Examples
    --------
    Difference with previous row

    >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
    ...                    'b': [1, 1, 2, 3, 5, 8],
    ...                    'c': [1, 4, 9, 16, 25, 36]})
    >>> df
       a  b   c
    0  1  1   1
    1  2  1   4
    2  3  2   9
    3  4  3  16
    4  5  5  25
    5  6  8  36

    >>> df.diff()
         a    b     c
    0  NaN  NaN   NaN
    1  1.0  0.0   3.0
    2  1.0  1.0   5.0
    3  1.0  1.0   7.0
    4  1.0  2.0   9.0
    5  1.0  3.0  11.0

    Difference with previous column

    >>> df.diff(axis=1)
        a  b   c
    0 NaN  0   0
    1 NaN -1   3
    2 NaN -1   7
    3 NaN -1  13
    4 NaN  0  20
    5 NaN  2  28

    Difference with 3rd previous row

    >>> df.diff(periods=3)
         a    b     c
    0  NaN  NaN   NaN
    1  NaN  NaN   NaN
    2  NaN  NaN   NaN
    3  3.0  2.0  15.0
    4  3.0  4.0  21.0
    5  3.0  6.0  27.0

    Difference with following row

    >>> df.diff(periods=-1)
         a    b     c
    0 -1.0  0.0  -3.0
    1 -1.0 -1.0  -5.0
    2 -1.0 -1.0  -7.0
    3 -1.0 -2.0  -9.0
    4 -1.0 -3.0 -11.0
    5  NaN  NaN   NaN

    Overflow in input dtype

    >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
    >>> df.diff()
           a
    0    NaN
    1  255.0
# time series is now stationary |
---|
fig, axes = plt.subplots(nrows=2, ncols=1, figsize = (12, 8))
axes[0] = diet.plot(ax = axes[0], title = "Original timeseries" )
axes[0].legend(loc = 2)
axes[1] = diet.diff().dropna().plot(ax = axes[1], title = "Stationary timeseries - original timeseries differenced once")
axes[1].legend().remove()
plt.tight_layout()
Global Temperature Dataset | ||
---|---|---|
- Dataset source - includes global monthly average temperature (°C) anomalies from 1880 to 2016 - GISS surface temperature analysis, GISTEMP - global component of Climate at a Glance, GCAG |
weather = pd.read_csv("./sarimax_prophet_repo/data/raw/time-series/monthly_csv.csv",index_col='Date', parse_dates=['Date'])
weather.sort_index(inplace=True)
df_overview(weather, 'Global Weather', 4)
Source | Mean | |
---|---|---|
datatype | object | float64 |
missing values | 0 | 0 |
count | nan | 3,288.00 |
mean | nan | 0.04 |
std | nan | 0.34 |
min | nan | -0.78 |
25% | nan | -0.21 |
50% | nan | -0.04 |
75% | nan | 0.24 |
max | nan | 1.35 |
total rows | 3,288 |
---|---|
total columns | 2 |
column names | Source, Mean |
index start | 1880-01-06 00:00:00 |
index end | 2016-12-06 00:00:00 |
total missing values | 0 |
Global Weather Head and Tail |
Source | Mean | |
---|---|---|
Date | ||
1880-01-06 | GISTEMP | -0.30 |
1880-01-06 | GCAG | 0.00 |
1880-02-06 | GCAG | -0.12 |
1880-02-06 | GISTEMP | -0.21 |
Source | Mean | |
---|---|---|
Date | ||
2016-11-06 | GISTEMP | 0.93 |
2016-11-06 | GCAG | 0.75 |
2016-12-06 | GISTEMP | 0.81 |
2016-12-06 | GCAG | 0.79 |
gistemp = pd.DataFrame(weather[weather.Source == 'GISTEMP']['Mean'].copy())
gistemp.columns = ['giss_mean']
gcag = pd.DataFrame(weather[weather.Source == 'GCAG']['Mean'].copy())
gcag.columns = ['gcag_mean']
multi([(gistemp.head(5), 'GISTEMP'),
(gcag.head(5), 'GCAG')])
giss_mean | |
---|---|
Date | |
1880-01-06 | -0.30 |
1880-02-06 | -0.21 |
1880-03-06 | -0.18 |
1880-04-06 | -0.27 |
1880-05-06 | -0.14 |
gcag_mean | |
---|---|
Date | |
1880-01-06 | 0.00 |
1880-02-06 | -0.12 |
1880-03-06 | -0.14 |
1880-04-06 | -0.05 |
1880-05-06 | -0.07 |
df_overview(gistemp, 'GISSTEMP', 3)
giss_mean | |
---|---|
datatype | float64 |
missing values | 0 |
count | 1,644.00 |
mean | 0.02 |
std | 0.34 |
min | -0.78 |
25% | -0.23 |
50% | -0.05 |
75% | 0.23 |
max | 1.35 |
total rows | 1,644 |
---|---|
total columns | 1 |
column names | giss_mean |
index start | 1880-01-06 00:00:00 |
index end | 2016-12-06 00:00:00 |
total missing values | 0 |
GISSTEMP Head and Tail |
giss_mean | |
---|---|
Date | |
1880-01-06 | -0.30 |
1880-02-06 | -0.21 |
1880-03-06 | -0.18 |
giss_mean | |
---|---|
Date | |
2016-10-06 | 0.89 |
2016-11-06 | 0.93 |
2016-12-06 | 0.81 |
df_overview(gcag, 'GCAG', 3)
gcag_mean | |
---|---|
datatype | float64 |
missing values | 0 |
count | 1,644.00 |
mean | 0.05 |
std | 0.33 |
min | -0.68 |
25% | -0.19 |
50% | -0.02 |
75% | 0.25 |
max | 1.22 |
total rows | 1,644 |
---|---|
total columns | 1 |
column names | gcag_mean |
index start | 1880-01-06 00:00:00 |
index end | 2016-12-06 00:00:00 |
total missing values | 0 |
GCAG Head and Tail |
gcag_mean | |
---|---|
Date | |
1880-01-06 | 0.00 |
1880-02-06 | -0.12 |
1880-03-06 | -0.14 |
gcag_mean | |
---|---|
Date | |
2016-10-06 | 0.73 |
2016-11-06 | 0.75 |
2016-12-06 | 0.79 |
fig, axes = plt.subplots(2, 1, figsize = (12, 8))
gistemp.plot(ax = axes[0]);
gcag.plot(ax = axes[1]);
plt.tight_layout()
plt.suptitle("Monthly mean temperature anomalies in degrees Celsius relative to a base period", size = 16, y = 1.04);
- plots above suggest a positive trend - the time series is therefore non-stationary |
---|
# decomposing the time series, using period = 12, 1 year |
---|
decomposition_gistemp = sm.tsa.seasonal_decompose(gistemp['giss_mean'],
period = 12)
fig = decomposition_gistemp.plot()
# increasing frequency/period to make seasonality possible to observe, increasing from 1 year to 20 years |
---|
decomposition_gistemp = sm.tsa.seasonal_decompose(gistemp['giss_mean'],
period = 240)
fig = decomposition_gistemp.plot()
Conclusions: | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
- trend is positive, suggesting non-stationarity - amplitude and frequency do not change, suggesting use of an additive model |
ADF and KPSS Tests |
---|
adf_test(gistemp)
Results of Dickey-Fuller Test:
Test Statistic                   -0.360964
p-value                           0.916415
#Lags Used                       24.000000
Number of Observations Used    1619.000000
Critical Value (1%)              -3.434396
Critical Value (5%)              -2.863327
Critical Value (10%)             -2.567721
dtype: float64
kpss_test(gistemp)
Results of KPSS Test:
Test Statistic            4.970866
p-value                   0.010000
Lags Used                26.000000
Critical Value (10%)      0.347000
Critical Value (5%)       0.463000
Critical Value (2.5%)     0.574000
Critical Value (1%)       0.739000
dtype: float64
/Users/evancarr/opt/anaconda3/envs/sktime_project_02_05_23/lib/python3.10/site-packages/statsmodels/tsa/stattools.py:2018: InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is smaller than the p-value returned. warnings.warn(
# both tests show the gistemp time series as non-stationary, which confirms what is seen in the plots above. |
---|
Zooming in to view the plots for just the time period 2014 - 2016, the last three years of the data |
---|
decomposition = sm.tsa.seasonal_decompose(gistemp['giss_mean']["2014":"2016"],
period = 12)
fig = decomposition.plot()
Observations: | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
- positive trend - seasonality present with lower values in July and higher in March |
# How many times must the data be differenced to achieve stationarity? |
---|
adf_kpss(gistemp, 3)
/Users/evancarr/opt/anaconda3/envs/sktime_project_02_05_23/lib/python3.10/site-packages/statsmodels/tsa/stattools.py:2018: InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is smaller than the p-value returned. warnings.warn( /Users/evancarr/opt/anaconda3/envs/sktime_project_02_05_23/lib/python3.10/site-packages/statsmodels/tsa/stattools.py:2022: InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is greater than the p-value returned. warnings.warn( /Users/evancarr/opt/anaconda3/envs/sktime_project_02_05_23/lib/python3.10/site-packages/statsmodels/tsa/stattools.py:2022: InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is greater than the p-value returned. warnings.warn(
d | adf_stats | p-value | is_adf_stationary | is_kpss_stationary | is_stationary | |
---|---|---|---|---|---|---|
0 | 0 | 9.164152e-01 | 0.01 | False | False | False |
1 | 1 | 9.464601e-23 | 0.10 | True | True | True |
2 | 2 | 2.255372e-29 | 0.10 | True | True | True |
adf_kpss(gcag, 3)
/Users/evancarr/opt/anaconda3/envs/sktime_project_02_05_23/lib/python3.10/site-packages/statsmodels/tsa/stattools.py:2018: InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is smaller than the p-value returned. warnings.warn( /Users/evancarr/opt/anaconda3/envs/sktime_project_02_05_23/lib/python3.10/site-packages/statsmodels/tsa/stattools.py:2022: InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is greater than the p-value returned. warnings.warn( /Users/evancarr/opt/anaconda3/envs/sktime_project_02_05_23/lib/python3.10/site-packages/statsmodels/tsa/stattools.py:2022: InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is greater than the p-value returned. warnings.warn(
d | adf_stats | p-value | is_adf_stationary | is_kpss_stationary | is_stationary | |
---|---|---|---|---|---|---|
0 | 0 | 8.921048e-01 | 0.01 | False | False | False |
1 | 1 | 7.940293e-22 | 0.10 | True | True | True |
2 | 2 | 4.754621e-29 | 0.10 | True | True | True |
# gistemp and gcag time series become stationary after differencing once (d=1). |
---|
Original Time Series Plotted |
---|
fig, (ax1,ax2) = plt.subplots(2,1)
plt.suptitle("Monthly mean temperature anomalies in degrees Celsius relative to a base period", size = 16)
gistemp.plot(ax=ax1, color = 'cyan')
gcag.plot(ax=ax2, color = 'yellow')
plt.tight_layout()
Differenced Time Series Plotted |
---|
fig, (ax1,ax2) = plt.subplots(2,1)
plt.suptitle("DIFFERENCED: Monthly mean temperature anomalies in degrees Celsius relative to a base period", size = 14);
gistemp.diff().dropna().plot(ax=ax1, color = 'cyan');
gcag.diff().dropna().plot(ax=ax2, color='yellow');
Forecasting with ARIMA | ||
---|---|---|
- Using SARIMAX but not setting any of the seasonality orders |
import warnings
warnings.filterwarnings('ignore')
from statsmodels.tools.sm_exceptions import ConvergenceWarning, InterpolationWarning
warnings.simplefilter('ignore', ConvergenceWarning)
warnings.simplefilter('ignore', InterpolationWarning)
# function to reduce memory usage, (Source, Numer.AI) |
---|
def reduce_memory_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
(S)ARIMA(X) models | ||
---|---|---|
ARIMA - AutoRegressive Integrated Moving Average
- a combination of two models: autoregressive, which uses lagged values to forecast, and moving average, which uses lagged residual errors to forecast, with an integration (differencing) step in between
- uses dependencies between data values and error values from past data to make predictions
- takes three parameters:
  * p - number of autoregressive lags
  * d - number of times differencing is applied to make the data stationary
  * q - number of moving average lags
- it is also possible to apply transformations before using the ARIMA model
- however, if differencing and other transformations are applied before the model, they must be reverse transformed to access the forecast of the original values
- it is important to difference the data ONLY until it is stationary and no further
- to know the value for d, the number of times to run differencing, use the ADF and KPSS tests, i.e. the adf_kpss() function above
||
ARIMAX
- an extended version of the ARIMA model which incorporates exogenous inputs
- modeled using other independent variables in addition to the time series
- example: when modeling the waiting time in an emergency room, the number of nurses available on a certain shift could be considered an external variable, since it may impact the waiting time. If this is indeed the case, by changing the number of nurses we can affect the waiting times. | ||
SARIMA
- this model should be used when there is seasonality, which ARIMA ignores
- SARIMA includes additional parameters to work with seasonality: P, D, Q, and S
  * P - seasonal autoregressive order
  * D - seasonal differencing order
  * Q - seasonal moving average order
  * S - length of the seasonal cycle
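A minimal sketch of how these parameters map onto statsmodels' SARIMAX (the series y and the order values are placeholders of my own, not tuned for any particular dataset):
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pandas as pd
import numpy as np

# y: a placeholder daily series (illustrative only, not tutorial data)
y = pd.Series(np.random.default_rng(0).normal(50, 5, 200),
              index=pd.date_range('2020-01-01', periods=200, freq='D'))

# order = (p, d, q): non-seasonal autoregressive, differencing, and moving average orders
# seasonal_order = (P, D, Q, S): their seasonal counterparts plus the cycle length S
# an exog argument would hold external regressors -- the "X" in SARIMAX
model = SARIMAX(y, order=(1, 1, 1), seasonal_order=(1, 1, 1, 7))
results = model.fit(disp=False)
forecast = results.get_forecast(steps=14).predicted_mean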
The Walmart Dataset | ||
---|---|---|
- comes from a Store Item Demand Forecasting Challenge on Kaggle - 5 years of store-item sales data split into train and test csv files - competition objective: forecast 3 months of sales for 50 different items at 10 different stores using the data |
Training Data Overview |
---|
df_train = pd.read_csv("./sarimax_prophet_repo/data/raw/store_item_demand_forecasting/train.csv")
reduce_memory_usage(df_train)
df_overview(df_train, 'Walmart Training Dataset')
Mem. usage decreased to 10.45 Mb (62.5% reduction)
date | store | item | sales | |
---|---|---|---|---|
datatype | object | int8 | int8 | int16 |
missing values | 0 | 0 | 0 | 0 |
count | nan | 913,000.00 | 913,000.00 | 913,000.00 |
mean | nan | 5.50 | 25.50 | 52.25 |
std | nan | 2.87 | 14.43 | 28.80 |
min | nan | 1.00 | 1.00 | 0.00 |
25% | nan | 3.00 | 13.00 | 30.00 |
50% | nan | 5.50 | 25.50 | 47.00 |
75% | nan | 8.00 | 38.00 | 70.00 |
max | nan | 10.00 | 50.00 | 231.00 |
total rows | 913,000 |
---|---|
total columns | 4 |
column names | date, store, item, sales |
index start | 0 |
index end | 912999 |
total missing values | 0 |
Walmart Training Dataset Head and Tail |
date | store | item | sales | |
---|---|---|---|---|
0 | 2013-01-01 | 1 | 1 | 13 |
1 | 2013-01-02 | 1 | 1 | 11 |
2 | 2013-01-03 | 1 | 1 | 14 |
date | store | item | sales | |
---|---|---|---|---|
912997 | 2017-12-29 | 10 | 50 | 74 |
912998 | 2017-12-30 | 10 | 50 | 62 |
912999 | 2017-12-31 | 10 | 50 | 82 |
Testing Data Overview |
---|
df_test = pd.read_csv("./sarimax_prophet_repo/data/raw/store_item_demand_forecasting/test.csv")
reduce_memory_usage(df_test)
df_overview(df_test, 'Walmart Testing Dataset')
Mem. usage decreased to 0.60 Mb (56.2% reduction)
id | date | store | item | |
---|---|---|---|---|
datatype | int32 | object | int8 | int8 |
missing values | 0 | 0 | 0 | 0 |
count | 45,000.00 | nan | 45,000.00 | 45,000.00 |
mean | 22,499.50 | nan | 5.50 | 25.50 |
std | 12,990.53 | nan | 2.87 | 14.43 |
min | 0.00 | nan | 1.00 | 1.00 |
25% | 11,249.75 | nan | 3.00 | 13.00 |
50% | 22,499.50 | nan | 5.50 | 25.50 |
75% | 33,749.25 | nan | 8.00 | 38.00 |
max | 44,999.00 | nan | 10.00 | 50.00 |
total rows | 45,000 |
---|---|
total columns | 4 |
column names | id, date, store, item |
index start | 0 |
index end | 44999 |
total missing values | 0 |
Walmart Testing Dataset Head and Tail |
id | date | store | item | |
---|---|---|---|---|
0 | 0 | 2018-01-01 | 1 | 1 |
1 | 1 | 2018-01-02 | 1 | 1 |
2 | 2 | 2018-01-03 | 1 | 1 |
id | date | store | item | |
---|---|---|---|---|
44997 | 44,997 | 2018-03-29 | 10 | 50 |
44998 | 44,998 | 2018-03-30 | 10 | 50 |
44999 | 44,999 | 2018-03-31 | 10 | 50 |
Unique Items by Store |
---|
header_text('Unique Items by Store: Training and Testing')
multi([(df_train.groupby('store').nunique()['item'],
'Training Data'),
(df_test.groupby('store').nunique()['item'],
'Testing Data')])
Unique Items by Store: Training and Testing |
item | |
---|---|
store | |
1 | 50 |
2 | 50 |
3 | 50 |
4 | 50 |
5 | 50 |
6 | 50 |
7 | 50 |
8 | 50 |
9 | 50 |
10 | 50 |
item | |
---|---|
store | |
1 | 50 |
2 | 50 |
3 | 50 |
4 | 50 |
5 | 50 |
6 | 50 |
7 | 50 |
8 | 50 |
9 | 50 |
10 | 50 |
# both datasets have 10 stores each with 50 unique items. Target column is 'sales' |
---|
# date columns contain string data and must be converted to datetime |
---|
df_train['date'] = pd.to_datetime(df_train['date'])
df_test['date'] = pd.to_datetime(df_test['date'])
train_range = str(df_train['date'].min().date()) + ' to ' + str(df_train['date'].max().date())
test_range = str(df_test['date'].min().date()) + ' to ' + str(df_test['date'].max().date())
pretty(train_range, "Period covered in train dataset:")
pretty(test_range, "Period covered in test dataset:")
Period covered in train dataset: |
2013-01-01 to 2017-12-31 |
Period covered in test dataset: |
2018-01-01 to 2018-03-31 |
# goal - predict sales of items in all stores from 01-01-2018 to 03-31-2018, i.e., 3 months. |
---|
pretty(len(df_train.groupby(["store"]).groups.keys()), 'number of stores')
pretty(len(df_train.groupby(["item"]).groups.keys()), 'number of items')
pretty(len(df_train.groupby(["store", "item"]).groups.keys()), 'number of time series')
number of stores |
10 |
number of items |
50 |
number of time series |
500 |
# sales amounts by store |
---|
see(df_train[['store','sales']].groupby('store')\
.sum()\
.sort_values('sales', ascending=False),
'Sales Amounts by Store')
Sales Amounts by Store |
sales | |
---|---|
store | |
2 | 6120128 |
8 | 5856169 |
3 | 5435144 |
10 | 5360158 |
9 | 5025976 |
4 | 5012639 |
1 | 4315603 |
5 | 3631016 |
6 | 3627670 |
7 | 3320009 |
plt.figure()
plt.title('Sales by Store');
sns.barplot(data=df_train,x='store',y='sales');
# items sold by the two stores with highest sales |
---|
df_store_2 = df_train[df_train.store == 2][['date','item','sales']]
reduce_memory_usage(df_store_2)
see(df_store_2.head(), 'store no. 2 sales')
Mem. usage decreased to 1.65 Mb (0.0% reduction)
store no. 2 sales |
date | item | sales | |
---|---|---|---|
1826 | 2013-01-01 | 1 | 12 |
1827 | 2013-01-02 | 1 | 16 |
1828 | 2013-01-03 | 1 | 16 |
1829 | 2013-01-04 | 1 | 20 |
1830 | 2013-01-05 | 1 | 16 |
see(df_store_2[['item','sales']].groupby('item')\
.sum()\
.sort_values('sales', ascending=False)[:5],
'store no. 2: highest selling items')
store no. 2: highest selling items |
sales | |
---|---|
item | |
28 | 205677 |
15 | 205569 |
18 | 197422 |
13 | 197031 |
25 | 188856 |
plt.figure()
plt.title("Store No. 2: Sales per Item")
sns.barplot(data=df_store_2[['item','sales']].groupby('item').sum().reset_index(),x='item',y='sales');
plt.xticks(size = 10);
# datetime feature engineering |
---|
df_store_2['year'] = df_store_2['date'].dt.year
df_store_2['month'] = df_store_2['date'].dt.month
df_store_2['day'] = df_store_2['date'].dt.dayofyear
df_store_2['weekday'] = df_store_2['date'].dt.weekday
df_store_2['year-month'] = df_store_2['date'].apply(lambda x: str(x.year)+'-'+str(x.month))
see(df_store_2.sample(5), 'Store No. 2: added datetime features')
Store No. 2: added datetime features |
date | item | sales | year | month | day | weekday | year-month | |
---|---|---|---|---|---|---|---|---|
878884 | 2014-08-02 | 49 | 49 | 2014 | 8 | 214 | 5 | 2014-8 |
203315 | 2014-09-22 | 12 | 90 | 2014 | 9 | 265 | 0 | 2014-9 |
824605 | 2015-12-16 | 46 | 55 | 2015 | 12 | 350 | 2 | 2015-12 |
257504 | 2013-02-08 | 15 | 67 | 2013 | 2 | 39 | 4 | 2013-2 |
112175 | 2015-03-01 | 7 | 87 | 2015 | 3 | 60 | 6 | 2015-3 |
# boxplot shows the level of outliers for each weekday |
---|
plt.title('Store No. 2: Sales by Weekday')
sns.boxplot(x="weekday", y="sales", data=df_store_2);
plt.xticks(ticks = [0, 1, 2, 3, 4, 5, 6],
labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']);
# highest sales are in June and July |
---|
plt.title('Store No. 2: Sales by Month')
sns.boxplot(x="month", y="sales", data=df_store_2);
plt.xticks(ticks = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']);
# observing seasonality across the years |
---|
plt.figure()
plt.xticks(rotation=45)
plt.title('Sales per Year-Month')
sns.boxplot(x="year-month", y="sales", data=df_store_2);
plt.xticks(size = 8);
# seasonality repeats year after year # volume differs year to year |
---|
sns.lineplot(data=df_store_2,
x='month',
y='sales',
hue='year',
palette='cool')
plt.title('Seasonality by Year')
plt.legend(loc=2);
# higher sales on weekends # July is the month with the most sales. |
---|
sns.lineplot(data=df_store_2,
x='weekday',
y='sales',
hue='month',
legend='full',
palette='rainbow')
plt.title('Weekday Seasonality by Month')
plt.legend(bbox_to_anchor=(1.01, 1), loc=2);
plt.xticks(ticks = [0, 1, 2, 3, 4, 5, 6],
labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']);
sns.lineplot(data=df_store_2,
x='weekday',
y='sales',
hue='year',
palette='cool')
plt.title('Weekday Seasonality by Year')
plt.legend(loc=2);
plt.xticks(ticks = [0, 1, 2, 3, 4, 5, 6],
labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']);
The Box-Jenkins Method | ||
---|---|---|
- Helps in choosing parameters that will lead to a good model. (Source) The original model uses an iterative three-stage modeling approach: 1. Model identification and model selection: making sure that the variables are stationary, identifying seasonality in the dependent series (seasonally differencing it if necessary), and using plots of the autocorrelation (ACF) and partial autocorrelation (PACF) functions of the dependent time series to decide which (if any) autoregressive or moving average component should be used in the model. 2. Parameter estimation using computation algorithms to arrive at coefficients that best fit the selected ARIMA model. The most common methods use maximum likelihood estimation or non-linear least-squares estimation.
3. Statistical model checking by testing whether the estimated model conforms to the specifications of a stationary univariate process. In particular, the residuals should be independent of each other and constant in mean and variance over time. (Plotting the mean and variance of residuals over time and performing a Ljung–Box test or plotting autocorrelation and partial autocorrelation of the residuals are helpful to identify misspecification.) If the estimation is inadequate, we have to return to step one and attempt to build a better model. |
||
- the Walmart dataset contains 500 time series, which are paired stores and items sold (10 stores, 50 items) - each of the time series will need a forecast model applied to it in order to forecast sales for all stores |
Box-Jenkins Step One: Identify the Model | ||
---|---|---|
- Identifying the characteristics of a time series in order to choose the appropriate model

Questions to ask:
1. Is the time series stationary?
2. If it is not stationary, which transformation is best to make it stationary?
3. Is the time series seasonal?
4. If seasonal, what is the periodicity of its seasonality?
5. Which orders (p and q) should be used?
||
- 500 time series, each being a store-item pair - a forecast model must be applied to all 500 time series pairs - the following is the method applied to one time series, the pair: store no. 2 and item 1 |
store2_item1 = df_store_2[['date','sales']]\
[df_store_2.item == 1]\
.reset_index(drop=True)\
.set_index('date')
head_tail_horz(store2_item1, 5, 'Store No. 2 - Item No. 1')
Store No. 2 - Item No. 1 |
sales | |
---|---|
date | |
2013-01-01 | 12 |
2013-01-02 | 16 |
2013-01-03 | 16 |
2013-01-04 | 20 |
2013-01-05 | 16 |
sales | |
---|---|
date | |
2017-12-27 | 19 |
2017-12-28 | 21 |
2017-12-29 | 18 |
2017-12-30 | 24 |
2017-12-31 | 31 |
store2_item1.plot(grid=True, color = 'cyan',
title = 'Store No. 2 - Item No. 1');
# period = 365 - yearly cycle |
---|
header_text('Store No. 2 - Item No. 1')
decomposition = sm.tsa.seasonal_decompose(store2_item1,
model = 'additive',
period=365);
decomposition.plot();
plt.tight_layout();
Store No. 2 - Item No. 1 |
Observations: | ||
---|---|---|
- the upward trend in sales volume means the series is non-stationary - the seasonal component does not increase greatly over time (i.e. it is not multiplicative), so the model is additive - the seasonal component shows lower sales at the beginning of the year and higher sales in the middle of summer - differencing will be applied to make the series stationary - use adf_kpss to find out how many times to apply differencing |
# applying stationarity tests |
---|
adf_kpss(store2_item1, 3)
d | adf_stats | p-value | is_adf_stationary | is_kpss_stationary | is_stationary | |
---|---|---|---|---|---|---|
0 | 0 | 3.804258e-02 | 0.01 | True | False | False |
1 | 1 | 8.084986e-22 | 0.10 | True | True | True |
2 | 2 | 1.151984e-27 | 0.10 | True | True | True |
# time series is stationary with d = 1 |
---|
Plotting ACF and PACF: | ||
---|---|---|
- autocorrelation and partial autocorrelation plots give clues for the best ARIMA parameter values
- they also show whether differencing is needed or whether too much has already been applied

Autocorrelation Function (ACF)
- the plot of the autocorrelation of a time series by lag, also known as a correlogram
- includes direct and indirect dependence information
- describes how well a present value of a time series is related to past values
- bars represent ACF values at increasing lags
- the shaded area represents the confidence interval, by default 95%
- if bars lie within the shaded region, they are statistically insignificant

Partial Autocorrelation Function (PACF)
- describes only the relationship between an observation and its lag
- finds the correlation of the residuals rather than correlations of present values with lagged values

Autocorrelation and Partial Autocorrelation - Identifying ARIMA parameters
- the time series should be made stationary before creating these plots
- if ACF values start high and trail off very slowly, it is a sign of non-stationarity and the need for differencing
- if the autocorrelation at lag-1 is very negative, it is a sign of too much differencing
Plotting correlation and autocorrelation |
---|
# help(plot_acf)
# help(plot_pacf)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
fig, (ax1, ax2) = plt.subplots(2,1, figsize=(12,8))
plot_acf(store2_item1,lags=14, zero=False, ax=ax1)
plot_pacf(store2_item1,lags=14, zero=False, ax=ax2)
ax1.set_ylim(-0.2, 1); ax2.set_ylim(-0.2, 0.8)
plt.tight_layout()
- the ACF shows periodic correlation patterns - find the periodicity by locating the first peak at a lag greater than 1 - the peak above is at lag 7, so the seasonal component repeats every 7 steps, i.e. weekly |
---|
# Create figure
fig, (ax1, ax2) = plt.subplots(2,1, figsize=(12,8))
plot_acf(store2_item1.diff().dropna(),lags=14, zero=False, ax=ax1)
plot_pacf(store2_item1.diff().dropna(),lags=14, zero=False, ax=ax2)
ax1.set_ylim(-0.5, 0.8); ax2.set_ylim(-0.8, 0.6)
plt.tight_layout()
# no clear trailing off in either plot - observed seasonal behavior by weeks - suggests the use of a model with seasonal parameters, i.e. SARIMA |
---|
Box-Jenkins Step Two: Estimate Coefficients (p, q) | ||
---|---|---|
- although SARIMA is the better choice, applying ARIMA first for comparison (i.e. SARIMAX with no seasonality settings? Not sure, because she uses SARIMAX first as well.) - this will show some of the advantages of choosing the appropriate model |
||
- to choose proper parameters, there is some trial and error
- the ARIMA model will be fit with different values, choosing the best values based on metrics like AIC and BIC

AIC - Akaike Information Criterion
- tells how good a model is; a lower value means a better model
- penalizes models with many parameters, i.e. if the order is too high compared to the data, there will be a high score
- this indicates where work should be done to avoid overfitting to the training data

BIC - Bayesian Information Criterion
- similar to AIC in that a lower value is better
- penalizes additional model orders more than AIC does, so BIC will sometimes suggest a simpler model
- these statistics can be obtained after fitting a model
- there is usually some agreement between the two metrics
- if there is no agreement, it is best to choose the model with the smaller AIC for a predictive model
SARIMAX() - model and attributes information |
---|
# help(SARIMAX)
# help(SARIMAX.fit)
# help(results)
store2_item1.index = pd.DatetimeIndex(store2_item1.index.values,
freq=store2_item1.index.inferred_freq)
from statsmodels.tsa.statespace.sarimax import SARIMAX
header_text('ARIMA Model Fitting')
model = SARIMAX(store2_item1, order=(1,1,1), freq='D')
results = model.fit()
ARIMA Model Fitting |
RUNNING THE L-BFGS-B CODE * * * Machine precision = 2.220D-16 N = 3 M = 10 At X0 0 variables are exactly at the bounds At iterate 0 f= 3.37742D+00 |proj g|= 1.21670D-01 At iterate 5 f= 3.30842D+00 |proj g|= 6.96031D-03 At iterate 10 f= 3.30768D+00 |proj g|= 8.50151D-06 * * * Tit = total number of iterations Tnf = total number of function evaluations Tnint = total number of segments explored during Cauchy searches Skip = number of BFGS updates skipped Nact = number of active bounds at final generalized Cauchy point Projg = norm of the final projected gradient F = final function value * * * N Tit Tnf Tnint Skip Nact Projg F 3 10 12 1 0 0 8.502D-06 3.308D+00 F = 3.3076756960161755 CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL
This problem is unconstrained.
results = results.summary()
# Create empty list to store search results
order_aic_bic = []
# Loop over p values from 0-6
for p in range(7):
    # Loop over q values from 0-6
    for q in range(7):
        # create and fit ARMA(p,q) model
        # because the adf test showed that d=1
        model = SARIMAX(store2_item1, order=(p, 1, q), freq="D")
        results = model.fit()
        # Append order and results tuple
        order_aic_bic.append((p, q, results.aic, results.bic))
# Construct DataFrame from order_aic_bic
order_df = pd.DataFrame(order_aic_bic,
columns=['p','q','AIC','BIC'])
order_AIC = order_df.sort_values('AIC')
order_BIC = order_df.sort_values('BIC')
multi([(order_AIC.head(5), 'AIC Best Results'),
(order_BIC.head(5), 'BIC Best Results')])
p | q | AIC | BIC | |
---|---|---|---|---|
33 | 4 | 5 | 11,664.36 | 11,719.46 |
40 | 5 | 5 | 11,673.12 | 11,733.72 |
34 | 4 | 6 | 11,682.70 | 11,743.30 |
48 | 6 | 6 | 11,708.12 | 11,779.74 |
26 | 3 | 5 | 11,717.15 | 11,766.73 |
p | q | AIC | BIC | |
---|---|---|---|---|
33 | 4 | 5 | 11,664.36 | 11,719.46 |
40 | 5 | 5 | 11,673.12 | 11,733.72 |
34 | 4 | 6 | 11,682.70 | 11,743.30 |
24 | 3 | 3 | 11,727.62 | 11,766.19 |
26 | 3 | 5 | 11,717.15 | 11,766.73 |
# AIC and BIC agree that p = 4 and q = 5 are the best parameter values |
---|
Box-Jenkins Step Three: Model Evaluation | ||
---|---|---|
- evaluating the accuracy of the model before choosing it as the best - focusing on residuals for evaluation - residuals are the difference between the model's one-step-ahead predictions and the real values of the time series |
||
Mean Absolute Error (MAE) - calculating the MAE of the residuals - this will show on average how far off the predictions are from the true values |
header_text('ARIMA Model Fitting')
model = SARIMAX(store2_item1, order=(4,1,5))
results = model.fit()
ARIMA Model Fitting |
This problem is unconstrained.
RUNNING THE L-BFGS-B CODE * * * Machine precision = 2.220D-16 N = 10 M = 10 At X0 0 variables are exactly at the bounds At iterate 0 f= 3.25847D+00 |proj g|= 5.01528D-01 At iterate 5 f= 3.24105D+00 |proj g|= 7.56448D-02 At iterate 10 f= 3.22557D+00 |proj g|= 6.79302D-01 At iterate 15 f= 3.21356D+00 |proj g|= 5.40317D-02 At iterate 20 f= 3.20500D+00 |proj g|= 3.40172D-01 At iterate 25 f= 3.20100D+00 |proj g|= 2.07851D-01 At iterate 30 f= 3.19762D+00 |proj g|= 1.21599D-01 At iterate 35 f= 3.19641D+00 |proj g|= 2.73309D-01 At iterate 40 f= 3.19246D+00 |proj g|= 2.33485D-01 At iterate 45 f= 3.18906D+00 |proj g|= 3.29806D-01 At iterate 50 f= 3.18849D+00 |proj g|= 1.59965D-01 * * * Tit = total number of iterations Tnf = total number of function evaluations Tnint = total number of segments explored during Cauchy searches Skip = number of BFGS updates skipped Nact = number of active bounds at final generalized Cauchy point Projg = norm of the final projected gradient F = final function value * * * N Tit Tnf Tnint Skip Nact Projg F 10 50 60 1 0 0 1.600D-01 3.188D+00 F = 3.1884893223892212 STOP: TOTAL NO. of ITERATIONS REACHED LIMIT
mae = np.mean(np.abs(results.resid))
pretty(f'{mae: .3f}%', 'Mean Absolute Error')
Mean Absolute Error |
4.730% |
df_overview(store2_item1, 'store2_item1')
sales | |
---|---|
datatype | int16 |
missing values | 0 |
count | 1,826.00 |
mean | 28.17 |
std | 8.68 |
min | 6.00 |
25% | 22.00 |
50% | 28.00 |
75% | 34.00 |
max | 58.00 |
total rows | 1,826 |
---|---|
total columns | 1 |
column names | sales |
index start | 2013-01-01 00:00:00 |
index end | 2017-12-31 00:00:00 |
total missing values | 0 |
store2_item1 Head and Tail |
sales | |
---|---|
2013-01-01 | 12 |
2013-01-02 | 16 |
2013-01-03 | 16 |
sales | |
---|---|
2017-12-29 | 18 |
2017-12-30 | 24 |
2017-12-31 | 31 |
- the MAE is around 5 sales per day - the average daily sales for item 1 in store 2 are 28 - for an ideal model, the residuals should be uncorrelated, white Gaussian noise centered on zero - in the following section, this will be evaluated as well |
---|
Diagnostic Summary Statistics | ||
---|---|---|
- Analyzing the residual test statistics from the results summary - Evaluating Prob(Q) by applying the Ljung-Box Test with the Null Hypothesis: there are no correlations in the residuals - Evaluating Prob(JB) by applying the Jarque-Bera Test with the Null Hypothesis: residuals are normally distributed |
- Prob(Q) = 0.65, which is greater than 0.05 - this means do not reject the null hypothesis that the residuals are uncorrelated - therefore the residuals are not correlated - Prob(JB) = 0.02, which is less than 0.05 - this means reject the null hypothesis that the residuals are normally distributed - i.e. the residuals are not normally distributed |
---|
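The same two statistics can also be computed directly from the model residuals; a minimal sketch of my own (the choice of 10 lags is arbitrary):
from statsmodels.stats.diagnostic import acorr_ljungbox
from scipy.stats import jarque_bera

resid = results.resid[1:]  # drop the first residual, which absorbs the initial value

# Ljung-Box: null hypothesis = no autocorrelation in the residuals
print(acorr_ljungbox(resid, lags=[10]))

# Jarque-Bera: null hypothesis = residuals are normally distributed
jb_stat, jb_pvalue = jarque_bera(resid)
print(f'Jarque-Bera p-value: {jb_pvalue:.3f}')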
Plot Diagnostics | ||
---|---|---|
- the following are four diagnostic plots using plot_diagnostics() that help in deciding whether a model is a good fit |
results.plot_diagnostics(figsize = (12, 8))
plt.tight_layout()
Conclusions: | ||
---|---|---|
- an ideal model will have residuals resembling uncorrelated white Gaussian noise centered on zero - this concept will be evaluated using the plots above |
||
Standardized Residual:
- there are no obvious patterns in the residuals - this suggests a good model

Histogram and KDE Estimate:
- shows the measured distribution of the residuals
- the green line shows the KDE curve, a smoothed version of the histogram
- the N(0,1) line shows a standard normal distribution
- for a good model, the KDE curve will be similar to the N(0,1) line

Correlogram:
- 95% of correlations for lags greater than one should not be significant (inside the blue area) - indicates a good model

Normal Q-Q:
- most of the data points occur on the line - this indicates a normal distribution of the residuals
||
The conclusion is that these metrics all point to this being a good model. | ||
- if the residuals are not normally distributed, increasing d can help fix this - if the residuals are correlated, increasing p and/or q can help |
Accounting for Seasonality | ||
---|---|---|
- for this, P, D, Q, and S should also be set
- earlier the ACF plot determined the seasonal period was 7 days, or 1 week
- remember to make a time series stationary before creating the ACF plot
- seasonal data might require seasonal differencing
- seasonal differencing subtracts from each value the value one seasonal cycle earlier
- if the time series shows a trend, the normal difference is taken
- if there is a strong seasonal cycle, the seasonal difference will also be taken
- before, our d = 1, and as a general rule d plus D should not equal more than 2
- first find the two orders of differencing, d and D, and make the series stationary
- the non-seasonal orders p and q can still be found by plotting the ACF and PACF of the differenced time series
- to find the seasonal orders P and Q, it is necessary to plot the ACF and PACF of the differenced time series at multiples of the seasonal period
fig, (ax1, ax2) = plt.subplots(2,1)
plot_acf(store2_item1, lags=14, zero=False, ax=ax1)
plot_pacf(store2_item1, lags=14, zero=False, ax=ax2)
ax1.set_ylim(-0.25, 0.75); ax2.set_ylim(-0.25, 0.6)
plt.tight_layout()
# Take the first and seasonal differences (7 days, 1 week) and drop NaNs |
---|
store2_item1_diff = store2_item1.diff().diff(7).dropna()
fig, (ax1, ax2) = plt.subplots(2,1,figsize=(8,6))
plot_acf(store2_item1_diff, lags=14, zero=False, ax=ax1)
plot_pacf(store2_item1_diff, lags=14, zero=False, ax=ax2)
ax1.set_ylim(-0.75, 0.5); ax2.set_ylim(-0.75, 0.5);
plt.tight_layout()
- the non-seasonal ACF and PACF plots above show a moving average model pattern with q = 1 - for the seasonal parameters, the lags parameter takes a list of lags instead of a maximum - follow with plotting ACF and PACF for these specific lags only. |
---|
lags = [7, 14, 21, 28, 35]
fig, (ax1, ax2) = plt.subplots(2,1,figsize=(8,6))
plot_acf(store2_item1_diff, lags=lags, zero=False, ax=ax1)
plot_pacf(store2_item1_diff, lags=lags, zero=False, ax=ax2)
ax1.set_ylim(-0.75, 0.25); ax2.set_ylim(-0.75, 0.25);
plt.tight_layout()
- the seasonal ACF and PACF plots look like a moving average = 1 model, i.e., Q=1 - combining both of these results: SARIMA(order = (0,1,6), seasonal_order = (0,1,1,7)) |
---|
sarima_model = SARIMAX(store2_item1, order=(0,1,6), seasonal_order=(0,1,1,7))
header_text('SARIMAX Model Fitting')
sarima_results = sarima_model.fit()
SARIMAX Model Fitting |
RUNNING THE L-BFGS-B CODE * * * Machine precision = 2.220D-16 N = 8 M = 10 At X0 0 variables are exactly at the bounds At iterate 0 f= 3.33061D+00 |proj g|= 6.10152D-02
This problem is unconstrained.
At iterate 5 f= 3.15105D+00 |proj g|= 1.39278D-02 At iterate 10 f= 3.14702D+00 |proj g|= 5.69528D-03 At iterate 15 f= 3.14364D+00 |proj g|= 3.50404D-02 At iterate 20 f= 3.14073D+00 |proj g|= 3.02480D-03 At iterate 25 f= 3.14069D+00 |proj g|= 2.30562D-04 * * * Tit = total number of iterations Tnf = total number of function evaluations Tnint = total number of segments explored during Cauchy searches Skip = number of BFGS updates skipped Nact = number of active bounds at final generalized Cauchy point Projg = norm of the final projected gradient F = final function value * * * N Tit Tnf Tnint Skip Nact Projg F 8 29 36 1 0 0 2.485D-05 3.141D+00 F = 3.1406911338133612 CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
mae = np.mean(np.abs(sarima_results.resid))
pretty(f'{mae:.3f}%', 'Mean Absolute Error for SARIMAX:')
Mean Absolute Error for SARIMAX: |
4.555% |
header_text('sarima_results.summary()')
sarima_results.summary()
sarima_results.summary() |
Dep. Variable: | sales | No. Observations: | 1826 |
---|---|---|---|
Model: | SARIMAX(0, 1, 6)x(0, 1, [1], 7) | Log Likelihood | -5734.902 |
Date: | Tue, 07 Feb 2023 | AIC | 11485.804 |
Time: | 11:45:43 | BIC | 11529.848 |
Sample: | 01-01-2013 | HQIC | 11502.054 |
- 12-31-2017 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ma.L1 | -0.8688 | 0.023 | -37.673 | 0.000 | -0.914 | -0.824 |
ma.L2 | 0.0019 | 0.032 | 0.060 | 0.952 | -0.061 | 0.065 |
ma.L3 | -0.0070 | 0.033 | -0.216 | 0.829 | -0.071 | 0.057 |
ma.L4 | 0.0082 | 0.030 | 0.270 | 0.787 | -0.051 | 0.068 |
ma.L5 | -0.0022 | 0.030 | -0.072 | 0.942 | -0.062 | 0.057 |
ma.L6 | -0.0064 | 0.024 | -0.268 | 0.789 | -0.054 | 0.041 |
ma.S.L7 | -0.9915 | 0.007 | -150.225 | 0.000 | -1.004 | -0.979 |
sigma2 | 31.6282 | 1.096 | 28.847 | 0.000 | 29.479 | 33.777 |
Ljung-Box (L1) (Q): | 0.00 | Jarque-Bera (JB): | 4.15 |
---|---|---|---|
Prob(Q): | 0.97 | Prob(JB): | 0.13 |
Heteroskedasticity (H): | 1.31 | Skew: | 0.10 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 2.90 |
- further progress in choosing model parameters can be made from investigating the AIC and BIC as done above - however, there are many more parameters now that the seasonal ones are added - instead Automated Model Selection is a good alternative - Prob(Q) = 0.97, which is greater than 0.05 - do not reject the null hypothesis that residuals are uncorrelated - residuals are not correlated - Prob(JB) = 0.13, which is greater than 0.05 - do not reject the null hypothesis that the residuals are normally distributed - the residuals are normally distributed |
---|
sarima_results.plot_diagnostics()
plt.tight_layout()
Standardized Residual: - no obvious patterns in the residuals - suggests a good model Histogram and KDE Estimate: - the two curves line up - suggests a good model Correlogram: - 95% of correlations for lag greater than one should not be significant (inside the shaded area) - these results also suggest a good model Normal Q-Q: - most of the data points should lie on the line, aside from outliers - this indicates a normal distribution of the residuals |
---|
Automated Model Selection | ||
---|---|---|
- pmdarima allows for the automation of the model order search - using the information from the Box-Jenkins identification step to predefine some of the orders before fitting - this can speed up the process of choosing model orders, but needs to be done carefully - automation can make mistakes due to imperfect data, which can affect the test scores in unpredictable ways - the only required parameter in auto_arima is the data - however, using prior knowledge to specify other parameters can help find the best model |
||
- seasonal = True # the time series is seasonal
- m = 7 # the seasonal period - one week
- d = 1 # non-seasonal difference order
- D = 1 # seasonal difference order
- max_p = 6 # max value of p to test
- max_q = 6 # max value of q to test
- max_P = 6 # max value of P to test
- max_Q = 6 # max value of Q to test
- information_criterion = 'aic' # used to select the best model
- trace = True # prints the information_criterion for each model it fits
- error_action = 'ignore' # ignore orders that don't work
- stepwise = True # apply an intelligent order search
import pmdarima as pm
auto_arima_model = pm.auto_arima(store2_item1,
seasonal=True,
m=7,
d=1,
D=1,
max_p=6,
max_q=6,
max_P=6,
max_Q=6,
information_criterion='aic',
trace=True,
error_action='ignore',
stepwise=True,
suppress_warnings=True)
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(1,1,1)[7]             : AIC=inf, Time=10.86 sec
 ARIMA(0,1,0)(0,1,0)[7]             : AIC=13739.527, Time=0.06 sec
 ARIMA(1,1,0)(1,1,0)[7]             : AIC=12650.572, Time=0.28 sec
 ARIMA(0,1,1)(0,1,1)[7]             : AIC=inf, Time=3.70 sec
 ARIMA(1,1,0)(0,1,0)[7]             : AIC=13229.902, Time=0.10 sec
 ARIMA(1,1,0)(2,1,0)[7]             : AIC=12480.179, Time=0.58 sec
 ARIMA(1,1,0)(3,1,0)[7]             : AIC=12344.238, Time=1.17 sec
 ARIMA(1,1,0)(4,1,0)[7]             : AIC=12270.400, Time=9.19 sec
 ARIMA(1,1,0)(5,1,0)[7]             : AIC=12224.430, Time=16.30 sec
 ARIMA(1,1,0)(6,1,0)[7]             : AIC=12196.670, Time=21.66 sec
 ARIMA(1,1,0)(6,1,1)[7]             : AIC=inf, Time=258.01 sec
 ARIMA(1,1,0)(5,1,1)[7]             : AIC=inf, Time=209.87 sec
 ARIMA(0,1,0)(6,1,0)[7]             : AIC=12702.277, Time=12.74 sec
 ARIMA(2,1,0)(6,1,0)[7]             : AIC=11998.566, Time=15.74 sec
 ARIMA(2,1,0)(5,1,0)[7]             : AIC=12027.447, Time=13.99 sec
 ARIMA(2,1,0)(6,1,1)[7]             : AIC=inf, Time=199.64 sec
 ARIMA(2,1,0)(5,1,1)[7]             : AIC=inf, Time=176.24 sec
 ARIMA(3,1,0)(6,1,0)[7]             : AIC=11888.959, Time=14.89 sec
 ARIMA(3,1,0)(5,1,0)[7]             : AIC=11915.280, Time=11.35 sec
 ARIMA(3,1,0)(6,1,1)[7]             : AIC=inf, Time=258.11 sec
 ARIMA(3,1,0)(5,1,1)[7]             : AIC=inf, Time=244.30 sec
 ARIMA(4,1,0)(6,1,0)[7]             : AIC=11829.853, Time=15.01 sec
 ARIMA(4,1,0)(5,1,0)[7]             : AIC=11855.759, Time=11.59 sec
 ARIMA(4,1,0)(6,1,1)[7]             : AIC=inf, Time=241.76 sec
 ARIMA(4,1,0)(5,1,1)[7]             : AIC=inf, Time=196.47 sec
 ARIMA(5,1,0)(6,1,0)[7]             : AIC=11787.764, Time=15.72 sec
 ARIMA(5,1,0)(5,1,0)[7]             : AIC=11813.941, Time=12.64 sec
 ARIMA(5,1,0)(6,1,1)[7]             : AIC=inf, Time=1564.05 sec
 ARIMA(5,1,0)(5,1,1)[7]             : AIC=inf, Time=239.66 sec
 ARIMA(6,1,0)(6,1,0)[7]             : AIC=11723.670, Time=22.98 sec
 ARIMA(6,1,0)(5,1,0)[7]             : AIC=11754.653, Time=19.05 sec
 ARIMA(6,1,0)(6,1,1)[7]             : AIC=inf, Time=684.40 sec
 ARIMA(6,1,0)(5,1,1)[7]             : AIC=inf, Time=3178.42 sec
 ARIMA(6,1,1)(6,1,0)[7]             : AIC=11680.726, Time=152.59 sec
 ARIMA(6,1,1)(5,1,0)[7]             : AIC=11700.932, Time=69.91 sec
 ARIMA(6,1,1)(6,1,1)[7]             : AIC=inf, Time=1472.88 sec
 ARIMA(6,1,1)(5,1,1)[7]             : AIC=inf, Time=3274.95 sec
 ARIMA(5,1,1)(6,1,0)[7]             : AIC=11681.051, Time=39.52 sec
 ARIMA(6,1,2)(6,1,0)[7]             : AIC=11681.423, Time=255.22 sec
 ARIMA(5,1,2)(6,1,0)[7]             : AIC=11683.639, Time=4220.90 sec
 ARIMA(6,1,1)(6,1,0)[7] intercept   : AIC=11682.576, Time=9994.42 sec

Best model:  ARIMA(6,1,1)(6,1,0)[7]
Total fit time: 27161.003 seconds
# Best model: ARIMA(6,1,1)(6,1,0)[7] |
---|
header_text('Fitting SARIMAX with optimized orders:')
sarima_optimized = SARIMAX(store2_item1, order=(6,1,1), seasonal_order=(6,1,0,7))
sarima_optimized_results = sarima_optimized.fit()
Fitting SARIMAX with optimized orders: |
This problem is unconstrained.
RUNNING THE L-BFGS-B CODE * * * Machine precision = 2.220D-16 N = 14 M = 10 At X0 0 variables are exactly at the bounds At iterate 0 f= 3.31381D+00 |proj g|= 1.91453D-01 At iterate 5 f= 3.23070D+00 |proj g|= 3.97055D-02 At iterate 10 f= 3.20137D+00 |proj g|= 6.16757D-03 At iterate 15 f= 3.19869D+00 |proj g|= 3.66528D-02 At iterate 20 f= 3.19206D+00 |proj g|= 1.39816D-02 At iterate 25 f= 3.19079D+00 |proj g|= 8.52650D-04 At iterate 30 f= 3.19078D+00 |proj g|= 2.85992D-05 * * * Tit = total number of iterations Tnf = total number of function evaluations Tnint = total number of segments explored during Cauchy searches Skip = number of BFGS updates skipped Nact = number of active bounds at final generalized Cauchy point Projg = norm of the final projected gradient F = final function value * * * N Tit Tnf Tnint Skip Nact Projg F 14 31 35 1 0 0 7.264D-06 3.191D+00 F = 3.1907792689051648 CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL
mae = np.mean(np.abs(sarima_optimized_results.resid))
pretty(f'{mae:.3f}%', 'Mean Absolute Error for SARIMAX optimized:')
print('MAE: %.3f' % mae)
Mean Absolute Error for SARIMAX optimized: |
4.788% |
MAE: 4.788
header_text('Optimized Sarimax Results')
sarima_optimized_results.summary()
Optimized Sarimax Results |
Dep. Variable: | sales | No. Observations: | 1826 |
---|---|---|---|
Model: | SARIMAX(6, 1, 1)x(6, 1, [], 7) | Log Likelihood | -5826.363 |
Date: | Tue, 07 Feb 2023 | AIC | 11680.726 |
Time: | 12:10:10 | BIC | 11757.803 |
Sample: | 01-01-2013 | HQIC | 11709.164 |
- 12-31-2017 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ar.L1 | 0.0671 | 0.026 | 2.595 | 0.009 | 0.016 | 0.118 |
ar.L2 | 0.0575 | 0.027 | 2.118 | 0.034 | 0.004 | 0.111 |
ar.L3 | 0.0467 | 0.026 | 1.800 | 0.072 | -0.004 | 0.098 |
ar.L4 | 0.0486 | 0.026 | 1.905 | 0.057 | -0.001 | 0.099 |
ar.L5 | 0.0363 | 0.026 | 1.390 | 0.164 | -0.015 | 0.087 |
ar.L6 | 0.0410 | 0.025 | 1.613 | 0.107 | -0.009 | 0.091 |
ma.L1 | -0.9464 | 0.014 | -69.687 | 0.000 | -0.973 | -0.920 |
ar.S.L7 | -0.8558 | 0.024 | -36.375 | 0.000 | -0.902 | -0.810 |
ar.S.L14 | -0.6815 | 0.032 | -21.632 | 0.000 | -0.743 | -0.620 |
ar.S.L21 | -0.5757 | 0.035 | -16.429 | 0.000 | -0.644 | -0.507 |
ar.S.L28 | -0.4127 | 0.033 | -12.481 | 0.000 | -0.478 | -0.348 |
ar.S.L35 | -0.2411 | 0.031 | -7.798 | 0.000 | -0.302 | -0.181 |
ar.S.L42 | -0.1133 | 0.024 | -4.677 | 0.000 | -0.161 | -0.066 |
sigma2 | 35.3666 | 1.202 | 29.416 | 0.000 | 33.010 | 37.723 |
Ljung-Box (L1) (Q): | 0.00 | Jarque-Bera (JB): | 0.04 |
---|---|---|---|
Prob(Q): | 1.00 | Prob(JB): | 0.98 |
Heteroskedasticity (H): | 1.31 | Skew: | 0.01 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 3.00 |
- Prob(Q) = 1.00, which is greater than 0.05 - do not reject the null hypothesis that residuals are uncorrelated - residuals are not correlated - Prob(JB) = 0.98, which is greater than 0.05 - do not reject the null hypothesis that the residuals are normally distributed - the residuals are normally distributed |
---|
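As a cross-check of the two p-values read from the summary above, the same tests can be run directly on the residuals - a minimal sketch, assuming statsmodels' acorr_ljungbox and scipy's jarque_bera are available in this environment (values may differ slightly from the summary table) |
---|
from statsmodels.stats.diagnostic import acorr_ljungbox
from scipy.stats import jarque_bera

resid = sarima_optimized_results.resid
lb = acorr_ljungbox(resid, lags=[1])       # Ljung-Box test of autocorrelation at lag 1
jb_stat, jb_pvalue = jarque_bera(resid)    # Jarque-Bera test of normality
# recent statsmodels versions return the Ljung-Box results as a DataFrame
print(f"Ljung-Box p-value (lag 1): {lb['lb_pvalue'].iloc[0]:.3f}")
print(f"Jarque-Bera p-value: {jb_pvalue:.3f}")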
sarima_optimized_results.plot_diagnostics()
plt.tight_layout()
Standardized Residual: - no obvious patterns in the residuals - suggests a good model Histogram and KDE Estimate: - shows the measured distribution of the residuals - KDE (green line) shows a smoothed version of the histogram - the two curves line up, but are slightly off Correlogram: - 95% of correlations for lag greater than one should not be significant (inside the shaded area) - these results also suggest a good model Normal Q-Q: - most of the data points should lie on the line - this indicates a normal distribution of the residuals |
---|
- comparing the diagnostics of the two SARIMAX models, the first model's KDE curve is closer to a normal distribution than the second's - the first model also achieved a lower in-sample residual MAE than the second - 4.55% vs 4.78% - conclusion: by these diagnostics, the first model is the more accurate |
---|
Forecasting: SARIMA vs ARIMA | ||
---|---|---|
- comparing the ARIMA model with the SARIMAX models - confirming that the seasonal (SARIMA) approach is the better one - using the last 90 days of data as the validation set |
||
Comparison Metrics: - Mean Absolute Error and Mean Absolute Percentage Error will be used to compare models - MAE & MAPE: evaluating forecasting models - MAE is clear and easy to comprehend - MAPE is unit-free and therefore useful for comparing forecast performance between datasets - the underlying formulas are sketched below |
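For reference, a minimal sketch of the two metrics - illustrative helper functions, not the sklearn implementations used below |
---|
import numpy as np

def mae_manual(y_true, y_pred):
    # MAE: average magnitude of the errors, expressed in the units of the data
    y_true, y_pred = np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

def mape_manual(y_true, y_pred):
    # MAPE: average absolute error as a fraction of the actual value, so it is unit-free
    y_true, y_pred = np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true))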
Comparing Model Result Summaries |
---|
header_text('ARIMA Model Results')
results.summary()
ARIMA Model Results |
Dep. Variable: | sales | No. Observations: | 1826 |
---|---|---|---|
Model: | SARIMAX(4, 1, 5) | Log Likelihood | -5822.182 |
Date: | Tue, 07 Feb 2023 | AIC | 11664.363 |
Time: | 12:27:23 | BIC | 11719.456 |
Sample: | 01-01-2013 | HQIC | 11684.686 |
- 12-31-2017 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ar.L1 | -0.5046 | 0.023 | -21.911 | 0.000 | -0.550 | -0.459 |
ar.L2 | 0.2367 | 0.013 | 18.744 | 0.000 | 0.212 | 0.261 |
ar.L3 | -0.5679 | 0.012 | -46.904 | 0.000 | -0.592 | -0.544 |
ar.L4 | -0.9455 | 0.022 | -42.098 | 0.000 | -0.989 | -0.901 |
ma.L1 | -0.4094 | 0.035 | -11.544 | 0.000 | -0.479 | -0.340 |
ma.L2 | -0.6496 | 0.045 | -14.279 | 0.000 | -0.739 | -0.560 |
ma.L3 | 0.7708 | 0.032 | 23.949 | 0.000 | 0.708 | 0.834 |
ma.L4 | 0.3847 | 0.045 | 8.529 | 0.000 | 0.296 | 0.473 |
ma.L5 | -0.7822 | 0.032 | -24.115 | 0.000 | -0.846 | -0.719 |
sigma2 | 33.3829 | 1.136 | 29.388 | 0.000 | 31.157 | 35.609 |
Ljung-Box (L1) (Q): | 0.21 | Jarque-Bera (JB): | 8.31 |
---|---|---|---|
Prob(Q): | 0.65 | Prob(JB): | 0.02 |
Heteroskedasticity (H): | 1.33 | Skew: | 0.16 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 2.96 |
header_text('First SARIMAX Model Results')
sarima_results.summary()
First SARIMAX Model Results |
Dep. Variable: | sales | No. Observations: | 1826 |
---|---|---|---|
Model: | SARIMAX(0, 1, 6)x(0, 1, [1], 7) | Log Likelihood | -5734.902 |
Date: | Tue, 07 Feb 2023 | AIC | 11485.804 |
Time: | 12:27:50 | BIC | 11529.848 |
Sample: | 01-01-2013 | HQIC | 11502.054 |
- 12-31-2017 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ma.L1 | -0.8688 | 0.023 | -37.673 | 0.000 | -0.914 | -0.824 |
ma.L2 | 0.0019 | 0.032 | 0.060 | 0.952 | -0.061 | 0.065 |
ma.L3 | -0.0070 | 0.033 | -0.216 | 0.829 | -0.071 | 0.057 |
ma.L4 | 0.0082 | 0.030 | 0.270 | 0.787 | -0.051 | 0.068 |
ma.L5 | -0.0022 | 0.030 | -0.072 | 0.942 | -0.062 | 0.057 |
ma.L6 | -0.0064 | 0.024 | -0.268 | 0.789 | -0.054 | 0.041 |
ma.S.L7 | -0.9915 | 0.007 | -150.225 | 0.000 | -1.004 | -0.979 |
sigma2 | 31.6282 | 1.096 | 28.847 | 0.000 | 29.479 | 33.777 |
Ljung-Box (L1) (Q): | 0.00 | Jarque-Bera (JB): | 4.15 |
---|---|---|---|
Prob(Q): | 0.97 | Prob(JB): | 0.13 |
Heteroskedasticity (H): | 1.31 | Skew: | 0.10 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 2.90 |
header_text('Optimized SARIMAX Model Results')
sarima_optimized_results.summary()
Optimized SARIMAX Model Results |
Dep. Variable: | sales | No. Observations: | 1826 |
---|---|---|---|
Model: | SARIMAX(6, 1, 1)x(6, 1, [], 7) | Log Likelihood | -5826.363 |
Date: | Tue, 07 Feb 2023 | AIC | 11680.726 |
Time: | 12:28:05 | BIC | 11757.803 |
Sample: | 01-01-2013 | HQIC | 11709.164 |
- 12-31-2017 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ar.L1 | 0.0671 | 0.026 | 2.595 | 0.009 | 0.016 | 0.118 |
ar.L2 | 0.0575 | 0.027 | 2.118 | 0.034 | 0.004 | 0.111 |
ar.L3 | 0.0467 | 0.026 | 1.800 | 0.072 | -0.004 | 0.098 |
ar.L4 | 0.0486 | 0.026 | 1.905 | 0.057 | -0.001 | 0.099 |
ar.L5 | 0.0363 | 0.026 | 1.390 | 0.164 | -0.015 | 0.087 |
ar.L6 | 0.0410 | 0.025 | 1.613 | 0.107 | -0.009 | 0.091 |
ma.L1 | -0.9464 | 0.014 | -69.687 | 0.000 | -0.973 | -0.920 |
ar.S.L7 | -0.8558 | 0.024 | -36.375 | 0.000 | -0.902 | -0.810 |
ar.S.L14 | -0.6815 | 0.032 | -21.632 | 0.000 | -0.743 | -0.620 |
ar.S.L21 | -0.5757 | 0.035 | -16.429 | 0.000 | -0.644 | -0.507 |
ar.S.L28 | -0.4127 | 0.033 | -12.481 | 0.000 | -0.478 | -0.348 |
ar.S.L35 | -0.2411 | 0.031 | -7.798 | 0.000 | -0.302 | -0.181 |
ar.S.L42 | -0.1133 | 0.024 | -4.677 | 0.000 | -0.161 | -0.066 |
sigma2 | 35.3666 | 1.202 | 29.416 | 0.000 | 33.010 | 37.723 |
Ljung-Box (L1) (Q): | 0.00 | Jarque-Bera (JB): | 0.04 |
---|---|---|---|
Prob(Q): | 1.00 | Prob(JB): | 0.98 |
Heteroskedasticity (H): | 1.31 | Skew: | 0.01 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 3.00 |
Model Predictions and Metrics |
---|
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_absolute_percentage_error as mape
# sklearn.metrics.mean_absolute_error() |
---|
# help(mean_absolute_error)
# sklearn.metrics.mean_absolute_percentage_error() |
---|
# help(mean_absolute_percentage_error)
# results.get_prediction() |
---|
# help(results.get_prediction)
# predictions.predicted_mean() |
---|
# help(arima_predictions.predicted_mean)
# creating the predictions and predicted mean for each model |
---|
arima_predictions = results.get_prediction(start=-90, dynamic=True)
arima_mean = arima_predictions.predicted_mean
sarima_og_predictions = sarima_results.get_prediction(start=-90, dynamic=True)
sarima_og_mean = sarima_og_predictions.predicted_mean
sarima_02_predictions = sarima_optimized_results.get_prediction(start=-90, dynamic=True)
sarima_02_mean = sarima_02_predictions.predicted_mean
# getting the MAE and MAPE for each model's predictions |
---|
arima_metrics = [round(mae(store2_item1[-90:], arima_mean),3),
round(mape(store2_item1[-90:], arima_mean),3)]
sarima_og_metrics = [round(mae(store2_item1[-90:], sarima_og_mean),3),
round(mape(store2_item1[-90:], sarima_og_mean),3)]
sarima_02_metrics = [round(mae(store2_item1[-90:], sarima_02_mean),3),
round(mape(store2_item1[-90:], sarima_02_mean),3)]
# combining model metric results into a dataframe |
---|
model_results = pd.DataFrame({'metrics':['MAE','MAPE'],
'ARIMA(4,1,5)': arima_metrics,
'SARIMA(0,1,6)(0,1,1)7': sarima_og_metrics,
'SARIMA(6,1,1)(6,1,0)7': sarima_02_metrics,
})
# model metric results: MAE and MAPE |
---|
model_results
metrics | ARIMA(4,1,5) | SARIMA(0,1,6)(0,1,1)7 | SARIMA(6,1,1)(6,1,0)7 | |
---|---|---|---|---|
0 | MAE | 6.906 | 6.910 | 6.285 |
1 | MAPE | 0.298 | 0.301 | 0.213 |
- based on these metrics, the second (optimized) SARIMAX model performed the best, with the lowest MAE and MAPE |
---|
model_results.to_csv('model_results.csv')
# plotting the predictions of all three models |
---|
dates = store2_item1.index
plt.figure(figsize=(15,10))
plt.title('Forecasting of All Models', size = 22)
plt.plot(arima_mean.index, arima_mean, label='ARIMA(4,1,5)')
plt.plot(sarima_og_mean.index, sarima_og_mean, label='SARIMA(0,1,6)(0,1,1)7')
plt.plot(sarima_02_mean.index, sarima_02_mean, label='SARIMAX(6,1,1)(6,1,0)7')
plt.plot(store2_item1[-90:], label='observed')
plt.legend(loc = 3);
# actual values vs ARIMA predictions |
---|
plt.figure(figsize=(15,5))
plt.title('Actual Values vs ARIMA(4,1,5)', size = 22)
plt.plot(store2_item1[-90:], label='observed');
plt.plot(arima_mean.index, arima_mean, label='ARIMA(4,1,5)');
plt.legend(loc = 3);
# actual values versus first SARIMAX predictions |
---|
plt.figure(figsize=(15,5))
plt.title('Actual Values vs SARIMA(0,1,6)(0,1,1)7', size = 22)
plt.plot(store2_item1[-90:], label='observed')
plt.plot(sarima_og_mean.index, sarima_og_mean, label='SARIMA(0,1,6)(0,1,1)7')
plt.legend(loc = 3);
# actual values versus second (optimized) SARIMAX predictions |
---|
plt.figure(figsize=(15,5))
plt.title('Actual Values vs SARIMA(6,1,1)(6,1,0)7 (automated selection)', size = 22)
plt.plot(store2_item1[-90:], label='observed')
plt.plot(sarima_02_mean.index, sarima_02_mean, label='SARIMA(6,1,1)(6,1,0)7');
plt.legend(loc = 1);
- of the three models, the optimized, automated selection SARIMAX model clearly follows the actual values more closely |
---|
Forecasting into the Future |
---|
# results.get_forecast() |
---|
# help(results.get_forecast)
# getting forecast predictions into the future with ARIMA and optimized SARIMAX |
---|
arima_predictions = results.get_forecast(steps=90)
arima_mean = arima_predictions.predicted_mean
sarima_02_pred = sarima_optimized_results.get_forecast(steps=90)
sarima_02_mean = sarima_02_pred.predicted_mean
dates = store2_item1.index
# Plot mean ARIMA and SARIMA predictions and observed
plt.title("Forecasting Comaprison - ARIMA vs SARIMA", size = 22)
plt.plot(store2_item1['2017':], label='actuals')
plt.plot(arima_mean.index, arima_mean, label='ARIMA(4,1,5)')
plt.plot(sarima_02_mean.index, sarima_02_mean, label='SARIMA(6,1,1)(6,1,0)7')
plt.legend(loc = 'lower center');
- the optimized SARIMAX model follows the trajectory of the actual values much more closely than the ARIMA model, which ignores the seasonal information |
---|
Saving the optimized SARIMAX model |
---|
import joblib
filename = "../model/store_2_item_28_model.pkl"
joblib.dump(sarima_optimized, 'optimized_sarimax_store2_item1.pkl')
['optimized_sarimax_store2_item1.pkl']
Loading the saved model |
---|
loaded_model = joblib.load('optimized_sarimax_store2_item1.pkl')
header_text('Loaded Model Summary')
loaded_model.fit().summary()
Loaded Model Summary |
RUNNING THE L-BFGS-B CODE * * * Machine precision = 2.220D-16 N = 14 M = 10 At X0 0 variables are exactly at the bounds At iterate 0 f= 3.19078D+00 |proj g|= 0.00000D+00 * * * Tit = total number of iterations Tnf = total number of function evaluations Tnint = total number of segments explored during Cauchy searches Skip = number of BFGS updates skipped Nact = number of active bounds at final generalized Cauchy point Projg = norm of the final projected gradient F = final function value * * * N Tit Tnf Tnint Skip Nact Projg F 14 0 1 0 0 0 0.000D+00 3.191D+00 F = 3.1907792689051648 CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL
This problem is unconstrained.
Dep. Variable: | sales | No. Observations: | 1826 |
---|---|---|---|
Model: | SARIMAX(6, 1, 1)x(6, 1, [], 7) | Log Likelihood | -5826.363 |
Date: | Tue, 07 Feb 2023 | AIC | 11680.726 |
Time: | 13:16:09 | BIC | 11757.803 |
Sample: | 01-01-2013 | HQIC | 11709.164 |
- 12-31-2017 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ar.L1 | -0.2785 | 0.000 | -775.130 | 0.000 | -0.279 | -0.278 |
ar.L2 | -0.2101 | 0.000 | -440.977 | 0.000 | -0.211 | -0.209 |
ar.L3 | -0.1874 | 0.001 | -350.905 | 0.000 | -0.188 | -0.186 |
ar.L4 | -0.1287 | 0.001 | -165.544 | 0.000 | -0.130 | -0.127 |
ar.L5 | 0.0875 | 0.001 | 87.445 | 0.000 | 0.086 | 0.089 |
ar.L6 | 0.3150 | 0.000 | 991.377 | 0.000 | 0.314 | 0.316 |
ma.L1 | -0.4701 | 0.000 | -2208.458 | 0.000 | -0.471 | -0.470 |
ar.S.L7 | -0.8755 | 0.000 | -7659.522 | 0.000 | -0.876 | -0.875 |
ar.S.L14 | -0.6881 | 0.000 | -4731.725 | 0.000 | -0.688 | -0.688 |
ar.S.L21 | -0.5753 | 0.000 | -3307.260 | 0.000 | -0.576 | -0.575 |
ar.S.L28 | -0.4573 | 0.000 | -2090.210 | 0.000 | -0.458 | -0.457 |
ar.S.L35 | -0.2777 | 0.000 | -770.609 | 0.000 | -0.278 | -0.277 |
ar.S.L42 | -0.1210 | 0.001 | -146.261 | 0.000 | -0.123 | -0.119 |
sigma2 | 60.9096 | 1.64e-06 | 3.71e+07 | 0.000 | 60.910 | 60.910 |
Ljung-Box (L1) (Q): | 0.00 | Jarque-Bera (JB): | 0.04 |
---|---|---|---|
Prob(Q): | 1.00 | Prob(JB): | 0.98 |
Heteroskedasticity (H): | 1.31 | Skew: | 0.01 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 3.00 |
# store2_item1.to_csv('store2_item1.csv')
Section Conclusions | ||
---|---|---|
- the optimized model could be further improved by using a true SARIMAX model and adding exogenous data such as holidays, etc. - Python Holidays Library - a sketch of this idea follows below |
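A minimal sketch of that idea, assuming the third-party holidays package is installed - the holiday dummy and its use here are illustrative additions, not part of the original notebook |
---|
import holidays
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

# build a 0/1 holiday indicator aligned with the sales index
us_holidays = holidays.US()
exog = pd.DataFrame({'is_holiday': [int(d in us_holidays) for d in store2_item1.index]},
                    index=store2_item1.index)

# pass it as an exogenous regressor -> the "X" in SARIMAX
sarimax_exog = SARIMAX(store2_item1, exog=exog,
                       order=(6, 1, 1), seasonal_order=(6, 1, 0, 7))
# sarimax_exog_results = sarimax_exog.fit()
# note: forecasting would also require future values of the exogenous variable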
Forecasting with Facebook Prophet Model | ||
---|---|---|
- Using Prophet on the Walmart Forecasting Dataset - Facebook Prophet on GitHub - Facebook / Meta Data Science Research - Prophet works with univariate (single-variable) time series data and is based on an additive model - it supports trends, seasonality, and holidays - it works best with data that has strong seasonal effects and several seasons' worth of historical data - Prophet is robust to missing data and shifts in the trend, and it handles outliers well - Prophet is easy to use - it automatically finds a good set of hyperparameters for the model |
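As a reference, Prophet's additive model decomposes the series into a trend, seasonal terms, a holiday term, and noise: $$y(t) = g(t) + s(t) + h(t) + \epsilon_t$$ where g(t) is the trend, s(t) the periodic seasonality, h(t) the effect of holidays, and the last term the residual error |
---|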
from prophet import Prophet
store_df = pd.read_csv('store2_item1.csv', parse_dates = ['Unnamed: 0'])
store_df.columns = ['date', 'sales']
store_df.date = pd.to_datetime(store_df.date)
store_df_dtindex = store_df.set_index('date')
df_overview(store_df_dtindex, 'Walmart Data')
sales | |
---|---|
datatype | int64 |
missing values | 0 |
count | 1,826.00 |
mean | 28.17 |
std | 8.68 |
min | 6.00 |
25% | 22.00 |
50% | 28.00 |
75% | 34.00 |
max | 58.00 |
total rows | 1,826 |
---|---|
total columns | 1 |
column names | sales |
index start | 2013-01-01 00:00:00 |
index end | 2017-12-31 00:00:00 |
total missing values | 0 |
Walmart Data Head and Tail |
sales | |
---|---|
date | |
2013-01-01 | 12 |
2013-01-02 | 16 |
2013-01-03 | 16 |
sales | |
---|---|
date | |
2017-12-29 | 18 |
2017-12-30 | 24 |
2017-12-31 | 31 |
# df.resample() |
---|
# help(store_df.resample)
store_df_dtindex.plot(title = 'Walmart Store No.2 Item No.1 Data',
color = 'cyan');
store_df_dtindex.resample('M')\
.sum()\
.plot(title = 'Walmart Data Resampled Monthly',
color = 'cyan');
store_df_dtindex.resample('A')\
.sum()\
.plot(title = 'Walmart Data Resampled Annually',
color = 'cyan');
decomposition = sm.tsa.seasonal_decompose(store_df_dtindex,
model = 'additive',
period = 365)
fig = decomposition.plot()
plt.suptitle('Decomposition of Store Data: period = 365 (1 year)')
plt.tight_layout()
decomposition = sm.tsa.seasonal_decompose(store_df_dtindex.resample('M').sum(),
model = 'additive',
period = 12)
fig = decomposition.plot()
plt.suptitle('Decomposition of Store Data: Resampled Monthly')
plt.tight_layout()
Conclusions: - there is a clear upward trend in the time series - the time series is not stationary (a quick stationarity check is sketched below) - the seasonal component is similar across time, not multiplicative - this points to the model being additive - seasonality in sales is higher in July and lower in January |
---|
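A minimal sketch of a stationarity check for the claim above, assuming statsmodels' adfuller is available in this environment |
---|
from statsmodels.tsa.stattools import adfuller

adf_stat, p_value, *_ = adfuller(store_df_dtindex['sales'])
print(f'ADF statistic: {adf_stat:.3f}, p-value: {p_value:.3f}')
# a p-value above 0.05 means the unit-root null cannot be rejected,
# i.e. the series should be treated as non-stationary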
Prophet Data Prep & Training | ||
---|---|---|
- Prophet requires as input a dataframe with two columns - ds will be the datetime column - y represents the metric to be forecast |
store_df.columns = ['ds', 'y']
head_tail_horz(store_df, 5, 'Dataframe Prepared for Prophet')
Dataframe Prepared for Prophet |
ds | y | |
---|---|---|
0 | 2013-01-01 | 12 |
1 | 2013-01-02 | 16 |
2 | 2013-01-03 | 16 |
3 | 2013-01-04 | 20 |
4 | 2013-01-05 | 16 |
ds | y | |
---|---|---|
1821 | 2017-12-27 | 19 |
1822 | 2017-12-28 | 21 |
1823 | 2017-12-29 | 18 |
1824 | 2017-12-30 | 24 |
1825 | 2017-12-31 | 31 |
# Prophet |
---|
# help(Prophet)
Model Training | ||
---|---|---|
- Prophet does not require hyperparameter specification - however, if the data is multiplicative, seasonality_mode must be set to 'multiplicative', since Prophet defaults to an additive model (a sketch follows below) - for this data, no extra parameters need to be set |
||
- Prophet Documentation | ||
- interval_width, which refers to the confidence level, is 0.8 by default - setting this parameter to 0.95 |
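A minimal sketch of how the multiplicative case mentioned above would be configured - not needed for this data, shown for illustration only |
---|
# illustrative only: for a series whose seasonal swings grow with the level of the series
prophet_multiplicative = Prophet(seasonality_mode='multiplicative', interval_width=0.95)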
prophet = Prophet(interval_width=0.95) #by default is 80%
prophet = prophet.fit(store_df)
14:49:09 - cmdstanpy - INFO - Chain [1] start processing 14:49:09 - cmdstanpy - INFO - Chain [1] done processing
Forecasting: | ||
---|---|---|
- make_future_dataframe() creates a dataframe in which future predictions will be stored - the prediction length is based on the periods parameter - periods will be set to 90 here, for 90 days - by default, it also includes the historical dates |
# Prophet.make_future_dataframe() |
---|
help(prophet.make_future_dataframe)
Help on method make_future_dataframe in module prophet.forecaster: make_future_dataframe(periods, freq='D', include_history=True) method of prophet.forecaster.Prophet instance Simulate the trend using the extrapolated generative model. Parameters ---------- periods: Int number of periods to forecast forward. freq: Any valid frequency for pd.date_range, such as 'D' or 'M'. include_history: Boolean to include the historical dates in the data frame for predictions. Returns ------- pd.Dataframe that extends forward from the end of self.history for the requested number of periods.
future = prophet.make_future_dataframe(periods = 90)
head_tail_horz(future, 5, 'Future Predictions DataFrame')
head_tail_horz(store_df, 5, 'Original DataFrame (90 days shorter)')
Future Predictions DataFrame |
ds | |
---|---|
0 | 2013-01-01 |
1 | 2013-01-02 |
2 | 2013-01-03 |
3 | 2013-01-04 |
4 | 2013-01-05 |
ds | |
---|---|
1911 | 2018-03-27 |
1912 | 2018-03-28 |
1913 | 2018-03-29 |
1914 | 2018-03-30 |
1915 | 2018-03-31 |
Original DataFrame (90 days shorter) |
ds | y | |
---|---|---|
0 | 2013-01-01 | 12 |
1 | 2013-01-02 | 16 |
2 | 2013-01-03 | 16 |
3 | 2013-01-04 | 20 |
4 | 2013-01-05 | 16 |
ds | y | |
---|---|---|
1821 | 2017-12-27 | 19 |
1822 | 2017-12-28 | 21 |
1823 | 2017-12-29 | 18 |
1824 | 2017-12-30 | 24 |
1825 | 2017-12-31 | 31 |
# make predictions by calling predict on the future dataframe |
---|
forecast = prophet.predict(future)
head_tail_vert(forecast, 5, 'Prophet Forecast Predictions')
Prophet Forecast Predictions: head(5) |
ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | additive_terms | additive_terms_lower | additive_terms_upper | weekly | weekly_lower | weekly_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yhat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2013-01-01 | 22.234184 | 1.622243 | 22.571609 | 22.234184 | 22.234184 | -9.684190 | -9.684190 | -9.684190 | -2.316626 | -2.316626 | -2.316626 | -7.367564 | -7.367564 | -7.367564 | 0.0 | 0.0 | 0.0 | 12.549994 |
1 | 2013-01-02 | 22.241942 | 2.140688 | 22.260295 | 22.241942 | 22.241942 | -10.080440 | -10.080440 | -10.080440 | -2.724676 | -2.724676 | -2.724676 | -7.355764 | -7.355764 | -7.355764 | 0.0 | 0.0 | 0.0 | 12.161502 |
2 | 2013-01-03 | 22.249700 | 3.583024 | 25.636708 | 22.249700 | 22.249700 | -7.527728 | -7.527728 | -7.527728 | -0.167199 | -0.167199 | -0.167199 | -7.360528 | -7.360528 | -7.360528 | 0.0 | 0.0 | 0.0 | 14.721973 |
3 | 2013-01-04 | 22.257458 | 6.660655 | 28.213595 | 22.257458 | 22.257458 | -5.071248 | -5.071248 | -5.071248 | 2.309826 | 2.309826 | 2.309826 | -7.381074 | -7.381074 | -7.381074 | 0.0 | 0.0 | 0.0 | 17.186210 |
4 | 2013-01-05 | 22.265216 | 8.750696 | 29.470322 | 22.265216 | 22.265216 | -3.660032 | -3.660032 | -3.660032 | 3.756210 | 3.756210 | 3.756210 | -7.416242 | -7.416242 | -7.416242 | 0.0 | 0.0 | 0.0 | 18.605184 |
Prophet Forecast Predictions: tail(5) |
ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | additive_terms | additive_terms_lower | additive_terms_upper | weekly | weekly_lower | weekly_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yhat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1911 | 2018-03-27 | 33.055049 | 18.865442 | 40.079129 | 33.031575 | 33.080619 | -3.285364 | -3.285364 | -3.285364 | -2.316626 | -2.316626 | -2.316626 | -0.968738 | -0.968738 | -0.968738 | 0.0 | 0.0 | 0.0 | 29.769684 |
1912 | 2018-03-28 | 33.059424 | 18.745015 | 39.585653 | 33.035271 | 33.085344 | -3.553184 | -3.553184 | -3.553184 | -2.724676 | -2.724676 | -2.724676 | -0.828508 | -0.828508 | -0.828508 | 0.0 | 0.0 | 0.0 | 29.506240 |
1913 | 2018-03-29 | 33.063799 | 21.300935 | 42.938685 | 33.038966 | 33.090405 | -0.852174 | -0.852174 | -0.852174 | -0.167199 | -0.167199 | -0.167199 | -0.684975 | -0.684975 | -0.684975 | 0.0 | 0.0 | 0.0 | 32.211625 |
1914 | 2018-03-30 | 33.068174 | 24.834109 | 45.752286 | 33.042662 | 33.095465 | 1.771207 | 1.771207 | 1.771207 | 2.309826 | 2.309826 | 2.309826 | -0.538618 | -0.538618 | -0.538618 | 0.0 | 0.0 | 0.0 | 34.839382 |
1915 | 2018-03-31 | 33.072550 | 26.562624 | 47.057891 | 33.046358 | 33.100391 | 3.366219 | 3.366219 | 3.366219 | 3.756210 | 3.756210 | 3.756210 | -0.389991 | -0.389991 | -0.389991 | 0.0 | 0.0 | 0.0 | 36.438768 |
- the forecast dataframe contains Prophet's predictions on sales - because it also includes the historical dates, Prophet provides an in-sample fit that can be used to evaluate the model - the forecast dataframe includes a yhat column with the point forecast - it also includes columns for the components and the uncertainty intervals |
---|
help(prophet.plot)
Help on method plot in module prophet.forecaster: plot(fcst, ax=None, uncertainty=True, plot_cap=True, xlabel='ds', ylabel='y', figsize=(10, 6), include_legend=False) method of prophet.forecaster.Prophet instance Plot the Prophet forecast. Parameters ---------- fcst: pd.DataFrame output of self.predict. ax: Optional matplotlib axes on which to plot. uncertainty: Optional boolean to plot uncertainty intervals. plot_cap: Optional boolean indicating if the capacity should be shown in the figure, if available. xlabel: Optional label name on X-axis ylabel: Optional label name on Y-axis figsize: Optional tuple width, height in inches. include_legend: Optional boolean to add legend to the plot. Returns ------- A matplotlib figure.
Plotting the Prophet Forecast | ||
---|---|---|
- the dark blue line is the forecast for sales, forecast['yhat'] - the black dots are the actual sales, store_df['y'] - the light blue shading is the 95% confidence interval around the forecast - the uncertainty interval is bounded by forecast['yhat_lower'] and forecast['yhat_upper'] |
header_text('Prophet Forecast Visualization')
plt.style.use('ggplot')
forecast_plot = prophet.plot(forecast)
plt.tick_params(color = 'black')
Prophet Forecast Visualization |
Trend Changepoints | ||
---|---|---|
- trend changepoints refer to the abrupt changes in trajectory that real-life time series data often contains - these are caused by things like new product launches, unforeseen problems, etc. - Prophet automatically detects changepoints and allows the trend to adapt appropriately - the growth rate varies, which makes the model more flexible - this can cause overfitting or underfitting, however - changepoint_prior_scale is the parameter that can be used to adjust trend flexibility and deal with over- or underfitting - a higher value fits a more flexible curve to the time series - by default, changepoints are only inferred for the first 80% of the data - changepoint_range can be used to change this behavior - changepoints can also be added manually using the changepoints argument - a sketch of these parameters follows after the changepoint plot below |
||
- Changepoint Documentation | ||
- Changepoints are represented by the dotted lines in the plot below |
header_text('Prophet Forecast with Changepoints')
from prophet.plot import add_changepoints_to_plot
plot = prophet.plot(forecast)
changepoints = add_changepoints_to_plot(plot.gca(), prophet, forecast)
Prophet Forecast with Changepoints |
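A minimal sketch of the changepoint parameters described above - the values are illustrative, not tuned for this data |
---|
flexible_prophet = Prophet(
    changepoint_prior_scale=0.5,    # higher -> more flexible trend (risk of overfitting)
    changepoint_range=0.9,          # infer changepoints over the first 90% of the history
    # changepoints=['2015-06-01'],  # alternatively, supply explicit changepoint dates
)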
# component plots |
---|
plot = prophet.plot_components(forecast)
Observations | ||
---|---|---|
- trend component - trends upwards - weekly seasonality component - shows more purchases on weekends and a drop in sales from Sunday to Monday - yearly seasonality component - confirms the patterns seen earlier: sales increase around July and decrease around January - the model can be further improved by accounting for holidays, etc. |
||
- Prophet: Seasonality, Holidays, and Special Events |
Prophet Model Evaluation | ||
---|---|---|
- the forecast dataframe also contains predictions made on the training data - it is possible to use this in-sample fit to evaluate the model |
prophet_data = pd.merge(store_df, forecast[['ds','yhat_lower','yhat_upper','yhat']],
on = 'ds')
prophet_data = prophet_data[['ds','yhat_lower','yhat_upper','yhat','y']]
df_overview(prophet_data, 'Prophet Forecast Data')
ds | yhat_lower | yhat_upper | yhat | y | |
---|---|---|---|---|---|
datatype | datetime64[ns] | float64 | float64 | float64 | int64 |
missing values | 0 | 0 | 0 | 0 | 0 |
count | nan | 1,826.00 | 1,826.00 | 1,826.00 | 1,826.00 |
mean | nan | 17.75 | 38.58 | 28.17 | 28.17 |
std | nan | 6.84 | 6.84 | 6.84 | 8.68 |
min | nan | -2.02 | 18.83 | 8.63 | 6.00 |
25% | nan | 13.10 | 33.83 | 23.53 | 22.00 |
50% | nan | 17.98 | 38.88 | 28.37 | 28.00 |
75% | nan | 22.72 | 43.42 | 33.12 | 34.00 |
max | nan | 35.39 | 56.51 | 46.14 | 58.00 |
total rows | 1,826 |
---|---|
total columns | 5 |
column names | ds, yhat_lower, yhat_upper, yhat, y |
index start | 0 |
index end | 1825 |
total missing values | 0 |
Prophet Forecast Data Head and Tail |
ds | yhat_lower | yhat_upper | yhat | y | |
---|---|---|---|---|---|
0 | 2013-01-01 | 1.62 | 22.57 | 12.55 | 12 |
1 | 2013-01-02 | 2.14 | 22.26 | 12.16 | 16 |
2 | 2013-01-03 | 3.58 | 25.64 | 14.72 | 16 |
ds | yhat_lower | yhat_upper | yhat | y | |
---|---|---|---|---|---|
1823 | 2017-12-29 | 17.29 | 37.47 | 27.46 | 18 |
1824 | 2017-12-30 | 18.39 | 39.24 | 28.97 | 24 |
1825 | 2017-12-31 | 19.76 | 40.56 | 29.80 | 31 |
# using MAE and MAPE to evaluate the model |
---|
actuals = prophet_data['y'].values
predictions = prophet_data['yhat'].values
prophet_mae = mae(actuals, predictions)
pretty(f'{prophet_mae:.3f}%', 'Prophet MAE Score:')
Prophet MAE Score: |
4.275% |
prophet_mape = mape(actuals, predictions)
pretty(f'{prophet_mape:.3f}%', 'Prophet MAPE Score:')
Prophet MAPE Score: |
0.169% |
plt.style.use('strawberries.mplstyle')
plt.plot(actuals, label='Actual Values')
plt.plot(predictions, label='Predicted Values')
plt.legend();
plt.title('Prophet Actual Values vs Predicted Values');
Prophet's Diagnostic Tools | ||
---|---|---|
- cross-validation and hyperparameter tuning are also available through Prophet | ||
- Prophet Diagnostic Tools Documentation | ||
Cross Validation - this compares predicted values with the actual values - forecast horizon (horizon) must be specified - the initial training period (initial) must also be specified - period refers to the spacing between cutoff dates |
%%capture
from prophet.diagnostics import cross_validation
cross_val = cross_validation(prophet, horizon = '90 days')
cross_val_df = cross_validation(prophet, initial='270 days',
period='45 days',
horizon = '90 days')
head_tail_vert(cross_val_df, 5, 'Prophet Cross Validation Data')
Prophet Cross Validation Data: head(5) |
ds | yhat | yhat_lower | yhat_upper | y | cutoff | |
---|---|---|---|---|---|---|
0 | 2013-10-24 | 25.245507 | 16.163705 | 35.008951 | 23 | 2013-10-23 |
1 | 2013-10-25 | 27.261290 | 18.080649 | 36.274408 | 22 | 2013-10-23 |
2 | 2013-10-26 | 28.151781 | 18.666897 | 36.423524 | 23 | 2013-10-23 |
3 | 2013-10-27 | 29.485806 | 20.902836 | 39.078639 | 18 | 2013-10-23 |
4 | 2013-10-28 | 21.198505 | 11.769138 | 30.222497 | 23 | 2013-10-23 |
Prophet Cross Validation Data: tail(5) |
ds | yhat | yhat_lower | yhat_upper | y | cutoff | |
---|---|---|---|---|---|---|
2965 | 2017-12-27 | 22.978292 | 12.523857 | 33.215580 | 19 | 2017-10-02 |
2966 | 2017-12-28 | 25.539708 | 15.842017 | 35.708661 | 21 | 2017-10-02 |
2967 | 2017-12-29 | 28.051326 | 17.459577 | 38.621720 | 18 | 2017-10-02 |
2968 | 2017-12-30 | 29.564439 | 19.699071 | 39.674125 | 24 | 2017-10-02 |
2969 | 2017-12-31 | 30.239662 | 19.039010 | 41.010030 | 31 | 2017-10-02 |
Performance Metrics |
---|
from prophet.diagnostics import performance_metrics
perf_metrics = performance_metrics(cross_val_df)
df_overview(perf_metrics, 'Prophet Performance Metrics')
horizon | mse | rmse | mae | mape | mdape | smape | coverage | |
---|---|---|---|---|---|---|---|---|
datatype | timedelta64[ns] | float64 | float64 | float64 | float64 | float64 | float64 | float64 |
missing values | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
count | 82 | 82.00 | 82.00 | 82.00 | 82.00 | 82.00 | 82.00 | 82.00 |
mean | 49 days 12:00:00 | 39.77 | 6.30 | 4.95 | 0.21 | 0.14 | 0.18 | 0.88 |
std | 23 days 19:33:58.568845364 | 3.92 | 0.31 | 0.26 | 0.01 | 0.01 | 0.01 | 0.02 |
min | 9 days 00:00:00 | 32.09 | 5.67 | 4.45 | 0.17 | 0.13 | 0.16 | 0.85 |
25% | 29 days 06:00:00 | 36.40 | 6.03 | 4.77 | 0.20 | 0.13 | 0.17 | 0.87 |
50% | 49 days 12:00:00 | 40.26 | 6.34 | 4.98 | 0.20 | 0.14 | 0.18 | 0.88 |
75% | 69 days 18:00:00 | 43.02 | 6.56 | 5.09 | 0.22 | 0.14 | 0.18 | 0.89 |
max | 90 days 00:00:00 | 46.97 | 6.85 | 5.56 | 0.23 | 0.16 | 0.20 | 0.91 |
total rows | 82 |
---|---|
total columns | 8 |
column names | horizon, mse, rmse, mae, mape, mdape, smape, coverage |
index start | 0 |
index end | 81 |
total missing values | 0 |
Prophet Performance Metrics Head and Tail |
horizon | mse | rmse | mae | mape | mdape | smape | coverage | |
---|---|---|---|---|---|---|---|---|
0 | 9 days 00:00:00 | 35.45 | 5.95 | 4.87 | 0.19 | 0.15 | 0.18 | 0.90 |
1 | 10 days 00:00:00 | 36.20 | 6.02 | 4.92 | 0.20 | 0.15 | 0.18 | 0.90 |
2 | 11 days 00:00:00 | 38.09 | 6.17 | 4.99 | 0.20 | 0.15 | 0.18 | 0.89 |
horizon | mse | rmse | mae | mape | mdape | smape | coverage | |
---|---|---|---|---|---|---|---|---|
79 | 88 days 00:00:00 | 40.96 | 6.40 | 5.07 | 0.20 | 0.15 | 0.18 | 0.86 |
80 | 89 days 00:00:00 | 43.29 | 6.58 | 5.20 | 0.21 | 0.14 | 0.18 | 0.86 |
81 | 90 days 00:00:00 | 44.83 | 6.70 | 5.30 | 0.21 | 0.15 | 0.18 | 0.86 |
header_text('Prophet Cross Validation MAE Scores')
plt.style.use('classic')
from prophet.plot import plot_cross_validation_metric
plot = plot_cross_validation_metric(cross_val_df, metric='mae')
- the blue line shows the MAE (above) and MAPE (below), where the mean is taken over a rolling window represented by the dots - per the performance metrics above, a MAPE of around 19-20% is typical for predictions 9 days into the future and rises to around 21% for predictions 90 days out |
---|
header_text('Prophet Cross Validation MAPE Scores')
plot = plot_cross_validation_metric(cross_val_df, metric='mape')
Fine-Tuning Prophet | ||
---|---|---|
- using grid search to fine-tune the hyperparameters - hyperparameters to tune are: - changepoint_prior_scale - seasonality_prior_scale - changepoint_prior_scale determines the flexibility of the trend, specifically how much the trend changes at the trend changepoints - seasonality_prior_scale controls the flexibility of the seasonality - Prophet Hyperparameter Tuning |
# code from the Prophet documentation that I turned into a function |
---|
def prophet_hypertune(df):
    # note: the %%capture cell magic cannot be used inside a function body, so it is omitted here
    import itertools
    param_grid = {
        'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5],
        'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],
    }
    # Generate all combinations of parameters
    all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
    maes = []   # Store the MAE for each set of params here
    mapes = []  # Store the MAPE for each set of params here
    # Use cross validation to evaluate all parameters
    for params in all_params:
        m = Prophet(**params).fit(df)  # Fit model with given params
        df_cv = cross_validation(m, horizon='90 days', parallel="processes")
        df_p = performance_metrics(df_cv, rolling_window=1)
        maes.append(df_p['mae'].values[0])
        mapes.append(df_p['mape'].values[0])
    # Find the best parameters
    tuning_results = pd.DataFrame(all_params)
    tuning_results['mae'] = maes
    tuning_results['mape'] = mapes
    return tuning_results
# tuning_results = prophet_hypertune(store_df)
# tuning_results.to_csv('prophet_tuning_results.csv')
tuning_results = pd.read_csv('prophet_tuning_results.csv')
header_text('Prophet Hyperparameter Tuning Results')
tuning_results.sort_values(['mae', 'mape'])
Prophet Hyperparameter Tuning Results |
Unnamed: 0 | changepoint_prior_scale | seasonality_prior_scale | mae | mape | |
---|---|---|---|---|---|
6 | 6 | 0.010 | 1.00 | 4.564569 | 0.175060 |
7 | 7 | 0.010 | 10.00 | 4.566228 | 0.175187 |
10 | 10 | 0.100 | 1.00 | 4.568626 | 0.171569 |
11 | 11 | 0.100 | 10.00 | 4.570424 | 0.171582 |
5 | 5 | 0.010 | 0.10 | 4.573044 | 0.175335 |
9 | 9 | 0.100 | 0.10 | 4.573120 | 0.171731 |
15 | 15 | 0.500 | 10.00 | 4.660408 | 0.173758 |
14 | 14 | 0.500 | 1.00 | 4.663109 | 0.173874 |
13 | 13 | 0.500 | 0.10 | 4.681716 | 0.174252 |
2 | 2 | 0.001 | 1.00 | 4.766691 | 0.187368 |
3 | 3 | 0.001 | 10.00 | 4.798983 | 0.189097 |
4 | 4 | 0.010 | 0.01 | 4.810390 | 0.184923 |
1 | 1 | 0.001 | 0.10 | 4.842964 | 0.191668 |
0 | 0 | 0.001 | 0.01 | 5.015576 | 0.199107 |
8 | 8 | 0.100 | 0.01 | 5.637384 | 0.210224 |
12 | 12 | 0.500 | 0.01 | 6.977255 | 0.255949 |
# all_params and mapes only exist inside prophet_hypertune(), and the tuning run above is
# commented out, so recover the best parameters from the saved tuning results instead
best_params = tuning_results.loc[tuning_results['mape'].idxmin(),
                                 ['changepoint_prior_scale', 'seasonality_prior_scale']].to_dict()
pretty(best_params, 'Best hyperparameters:')
Best hyperparameters: |
{'changepoint_prior_scale': 0.1, 'seasonality_prior_scale': 1.0} |
Hypertuned Model |
---|
model = Prophet(interval_width=0.95, weekly_seasonality=True,
changepoint_prior_scale=best_params['changepoint_prior_scale'],
seasonality_prior_scale=best_params['seasonality_prior_scale'])
prophet = model.fit(store_df)
17:36:48 - cmdstanpy - INFO - Chain [1] start processing 17:36:49 - cmdstanpy - INFO - Chain [1] done processing
future = prophet.make_future_dataframe(periods=90)
forecast = prophet.predict(future)
header_text('Optimized Prophet Forecasting Results')
plot = prophet.plot(forecast)
Optimized Prophet Forecasting Results |
header_text('Optimized Prophet Components')
plot = prophet.plot_components(forecast)
Optimized Prophet Components |
opt_prophet_results = pd.merge(store_df,
forecast[['ds','yhat_lower','yhat_upper','yhat']],
on='ds')
opt_prophet_results = opt_prophet_results[['ds','yhat_lower','yhat_upper','yhat','y']]
actuals = opt_prophet_results['y'].values
predictions = opt_prophet_results['yhat'].values
mae_02 = mae(actuals, predictions)
pretty(f'{mae_02:.3f}%', 'Optimized Prophet MAE')
Optimized Prophet MAE |
4.270% |
mape_02 = mape(actuals, predictions)
pretty(f'{mape_02:.3f}%', 'Optimized Prophet MAPE')
Optimized Prophet MAPE |
0.168% |
# results are nearly identical to the model before hyperparameter tuning (only a very slight improvement) |
---|
plt.style.use('strawberries.mplstyle')
plt.plot(actuals, label='Actual Values')
plt.plot(predictions, label='Predicted Values')
plt.title('Optimized Prophet Accuracy')
plt.legend(loc = 2);
# %%capture
# # df_cv = cross_validation(m, horizon='90 days')
# cross_val_df = cross_validation(m, initial='270 days', period='45 days', horizon = '90 days')
perf_metrics = performance_metrics(cross_val_df)
head_tail_vert(perf_metrics, 5, 'Optimized Prophet Performance')
Optimized Prophet Performance: head(5) |
horizon | mse | rmse | mae | mape | mdape | smape | coverage | |
---|---|---|---|---|---|---|---|---|
0 | 9 days | 35.947220 | 5.995600 | 4.926284 | 0.191647 | 0.152566 | 0.176928 | 0.690236 |
1 | 10 days | 36.742474 | 6.061557 | 4.980188 | 0.199179 | 0.156536 | 0.181208 | 0.683502 |
2 | 11 days | 38.087855 | 6.171536 | 4.999319 | 0.201397 | 0.150212 | 0.181926 | 0.676768 |
3 | 12 days | 38.071688 | 6.170226 | 4.962704 | 0.202298 | 0.144186 | 0.181509 | 0.676768 |
4 | 13 days | 35.446195 | 5.953671 | 4.777227 | 0.197603 | 0.137403 | 0.175873 | 0.690236 |
Optimized Prophet Performance: tail(5) |
horizon | mse | rmse | mae | mape | mdape | smape | coverage | |
---|---|---|---|---|---|---|---|---|
77 | 86 days | 45.547330 | 6.748876 | 5.358512 | 0.220390 | 0.157445 | 0.190333 | 0.670034 |
78 | 87 days | 43.664712 | 6.607928 | 5.238313 | 0.209376 | 0.151744 | 0.184221 | 0.683502 |
79 | 88 days | 43.111591 | 6.565942 | 5.242348 | 0.202869 | 0.150846 | 0.181497 | 0.673401 |
80 | 89 days | 45.311504 | 6.731382 | 5.354620 | 0.213010 | 0.150907 | 0.185890 | 0.656566 |
81 | 90 days | 46.559753 | 6.823471 | 5.402354 | 0.215018 | 0.152460 | 0.186969 | 0.649832 |
header_text('Optimized Prophet MAPE Scores')
plt.style.use('classic')
plot = plot_cross_validation_metric(cross_val_df, metric='mape')
Optimized Prophet MAPE Scores |
# comparing prophet_01 and prophet_02 |
---|
metrics_prophet_01 = [round(prophet_mae,3), round(prophet_mape,3)]
metrics_prophet_02 = [round(mae_02,3), round(mape_02,3)]
pd.DataFrame({'metrics':['MAE','MAPE'],
'Prophet_01':metrics_prophet_01,
'Prophet_02':metrics_prophet_02,
})
metrics | Prophet_01 | Prophet_02 | |
---|---|---|---|
0 | MAE | 4.275 | 4.270 |
1 | MAPE | 0.169 | 0.168 |
- the fine-tuned, second Prophet model slightly outperformed the first |
---|
Incorporating Holidays | ||
---|---|---|
- adding US Holidays by using add_country_holidays(country_name='US') |
model = Prophet(interval_width=0.95, weekly_seasonality=True,
changepoint_prior_scale=best_params['changepoint_prior_scale'],
seasonality_prior_scale=best_params['seasonality_prior_scale'])
model.add_country_holidays(country_name='US')
prophet_03 = model.fit(store_df)
18:10:16 - cmdstanpy - INFO - Chain [1] start processing 18:10:16 - cmdstanpy - INFO - Chain [1] done processing
pretty('US Holidays from Prophet')
pd.DataFrame(model.train_holiday_names).style\
.hide(axis = 'index')\
.hide(axis = 'columns')
US Holidays from Prophet |
New Year's Day |
Martin Luther King Jr. Day |
Washington's Birthday |
Memorial Day |
Independence Day |
Labor Day |
Columbus Day |
Veterans Day |
Thanksgiving |
Christmas Day |
Christmas Day (Observed) |
New Year's Day (Observed) |
Veterans Day (Observed) |
Independence Day (Observed) |
future = prophet_03.make_future_dataframe(periods=90)
forecast = prophet_03.predict(future)
header_text('Prophet Model Predictions, holidays incl.')
plt.style.use('classic')
plot = prophet_03.plot(forecast)
Prophet Model Predictions, holidays incl. |
header_text('Prophet Model Components, holidays incl.')
plot = prophet_03.plot_components(forecast)
Prophet Model Components, holidays incl. |
prophet_03_results = pd.merge(store_df,
forecast[['ds','yhat_lower','yhat_upper','yhat']],on='ds')
prophet_03_results = prophet_03_results[['ds','yhat_lower','yhat_upper','yhat','y']]
# calculate MAE between expected and predicted values for december
actuals = prophet_03_results['y'].values
predictions = prophet_03_results['yhat'].values
mae_03 = mae(actuals, predictions)
pretty(f'{mae_03:.3f}', 'Prophet MAE, holidays incl.')
Prophet MAE, holidays incl. |
4.250 |
mape_03 = mape(actuals, predictions)
pretty(f'{mape_03:.3f}', 'Prophet MAPE, holidays incl.')
Prophet MAPE, holidays incl. |
0.168 |
By including the US holidays provided by Prophet, we improve the model a bit. Even though it is not a significant improvement, it shows that adjusting parameters based on our business case can work in our favor. |
---|
plt.style.use('strawberries.mplstyle')
plt.plot(actuals, label='Actual Values')
plt.plot(predictions, label='Predicted Values')
plt.legend();
cross_val_df = cross_validation(prophet_03, initial='270 days', period='45 days', horizon = '90 days')
metrics_prophet_03 = [round(mae_03,3), round(mape_03,3)]
pd.DataFrame({'metrics':['MAE','MAPE'],
'Prophet_01':metrics_prophet_01,
'Prophet_02':metrics_prophet_02,
'Prophet_03':metrics_prophet_03,
})
metrics | Prophet_01 | Prophet_02 | Prophet_03 | |
---|---|---|---|---|
0 | MAE | 4.275 | 4.270 | 4.250 |
1 | MAPE | 0.169 | 0.168 | 0.168 |
# adding holidays improved the model a bit more |
---|
perf_metrics = performance_metrics(cross_val_df)
head_tail_vert(perf_metrics, 5, 'Prophet Performance, holidays incl.')
Prophet Performance, holidays incl.: head(5) |
horizon | mse | rmse | mae | mape | mdape | smape | coverage | |
---|---|---|---|---|---|---|---|---|
0 | 9 days | 36.150575 | 6.012535 | 4.918369 | 0.193490 | 0.146556 | 0.177075 | 0.892256 |
1 | 10 days | 37.134604 | 6.093817 | 4.996481 | 0.202334 | 0.152226 | 0.182360 | 0.885522 |
2 | 11 days | 38.820715 | 6.230627 | 5.040513 | 0.205745 | 0.151914 | 0.184104 | 0.868687 |
3 | 12 days | 38.447719 | 6.200622 | 4.977827 | 0.205776 | 0.144818 | 0.182800 | 0.872054 |
4 | 13 days | 35.533510 | 5.960999 | 4.774992 | 0.200319 | 0.136720 | 0.176643 | 0.895623 |
Prophet Performance, holidays incl.: tail(5) |
horizon | mse | rmse | mae | mape | mdape | smape | coverage | |
---|---|---|---|---|---|---|---|---|
77 | 86 days | 44.068967 | 6.638446 | 5.205878 | 0.215983 | 0.150907 | 0.185194 | 0.865320 |
78 | 87 days | 42.260486 | 6.500807 | 5.090820 | 0.204757 | 0.149675 | 0.179273 | 0.872054 |
79 | 88 days | 41.025674 | 6.405129 | 5.083517 | 0.197753 | 0.151751 | 0.176397 | 0.875421 |
80 | 89 days | 43.895481 | 6.625367 | 5.239270 | 0.210582 | 0.146854 | 0.182493 | 0.865320 |
81 | 90 days | 45.513572 | 6.746375 | 5.336867 | 0.214079 | 0.151751 | 0.185094 | 0.868687 |
plt.style.use('classic')
plot = plot_cross_validation_metric(cross_val_df, metric='mape')
Saving and Loading Best Model | ||
---|---|---|
- in Python, Prophet models should not be saved with pickle - the attached Stan backend makes this unreliable - instead, use the built-in serialization functions to serialize the model to JSON |
import json
from prophet.serialize import model_to_json, model_from_json
# saving model |
---|
with open('prophet_model.json', 'w') as model_out:
    json.dump(model_to_json(prophet_03), model_out)
# loading model |
---|
with open('prophet_model.json', 'r') as model_in:
    prophet = model_from_json(json.load(model_in))
Conclusion | ||
---|---|---|
- the Prophet models clearly outperform the ARIMA and SARIMA models - the best Prophet model has a MAE 32.38% lower than the MAE of the best SARIMA model - MAPE of the best Prophet is 21.13% lower than the MAPE of the best SARIMA model |
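A quick check of the quoted improvement figures, using the MAE and MAPE values from the comparison table below |
---|
best_sarima_mae, best_prophet_mae = 6.285, 4.250
best_sarima_mape, best_prophet_mape = 0.213, 0.168
print(f'MAE improvement:  {(best_sarima_mae - best_prophet_mae) / best_sarima_mae:.2%}')    # ~32.38%
print(f'MAPE improvement: {(best_sarima_mape - best_prophet_mape) / best_sarima_mape:.2%}') # ~21.13%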
Comparing all results - SARIMA models vs Prophet models |
---|
sarimax_results = pd.read_csv("model_results.csv")
prophet_results = pd.DataFrame({'metrics':['MAE','MAPE'],
'Prophet_01':metrics_prophet_01,
'Prophet_02':metrics_prophet_02,
'Prophet_03':metrics_prophet_03,
})
sarimax_results.merge(prophet_results, on='metrics').drop(columns = ['Unnamed: 0'])
metrics | ARIMA(4,1,5) | SARIMA(0,1,6)(0,1,1)7 | SARIMA(6,1,1)(6,1,0)7 | Prophet_01 | Prophet_02 | Prophet_03 | |
---|---|---|---|---|---|---|---|
0 | MAE | 6.906 | 6.910 | 6.285 | 4.275 | 4.270 | 4.250 |
1 | MAPE | 0.298 | 0.301 | 0.213 | 0.169 | 0.168 | 0.168 |