Fall 2024 Data Science Project
Elizabeth Ipe: Contributed 100% to all sections.
Matthew Welling: Contributed 100% to all sections.
Nadia Meyerovich: Contributed 100% to all sections.
Parsa Sedghi: Contributed 100% to all sections.
Satya Shah: Contributed 100% to all sections.
The SPY ETF, also known as the SPDR S&P 500 ETF Trust, is an extremely popular exchange-traded fund (ETF). ETFs are pooled investments that can be traded the same way as individual stocks. Investors purchase shares of the fund itself, which in turn owns shares of a group of companies. The benefit of ETFs is that investors can spread their assets across a group of companies or industries, decreasing volatility and preventing major losses on the initial investment. In contrast, individual stocks can be volatile, offering the chance for greater gains at the risk of greater losses. The SPY ETF tracks the performance of the S&P 500, an index of roughly 500 of the largest publicly traded companies in the United States. The total value of the S&P 500 grows or shrinks with the economy, so as long as economic conditions are improving, investors can be confident that purchasing shares of SPY will provide a positive return on investment over time.
For years, financial analysts, investors, and business owners have sought accurate methods to predict changes in stock prices over time. The value of a stock varies greatly depending on company performance, sociopolitical events, and general economic variations. Gaining the ability to accurately predict the change in a stock's value would give an investor a massive advantage in trading and allow them to boost their profits. Even for a less volatile investment such as SPY, being able to predict the value of a share would let an investor know the best times to buy and sell in order to maximize returns. For our project, we wanted to see if it was possible to predict the value of SPY over time. We applied our knowledge of machine learning to create a model that would potentially accomplish this goal. Succeeding would mean that we could make more informed investment decisions and maximize our returns.
Further reading on SPY and the S&P 500:
https://www.investopedia.com/articles/investing/122215/spy-spdr-sp-500-trust-etf.asp
https://www.forbes.com/advisor/investing/what-is-sp-500/
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn import tree
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
Data: Yahoo Finance daily stock prices, a free source with thousands of different datasets available to download.
Inputs: Company, Price High, Low, Close, and Volume Traded for each day from the past 3 years (10/4/2021 - 10/1/2024).
https://finance.yahoo.com/quote/%5EGSPC/history/
Our SPY ETF data will allow us to observe the daily prices per share, the daily trading activity, and by extension the performance of the S&P 500 as a whole. Our dataset contains measurements of SPY value and activity each day from October 2021 to October 2024. The dataset includes daily opening, closing, low, and high prices per share, along with daily volume measurements. We will use our data to analyze both trends in the stock prices and volume over long periods of time, as well as variations within individual days.
df = pd.read_csv('SPY_Data.csv')
First, we need to clean up the data. The volume values are stored as comma-separated strings, so we convert them to integers, and the dates are not in a consistent format, so we parse them all into a standard datetime format.
df["Volume"] = df["Volume"].str.replace(',', '').astype(int)
df['Date'] = pd.to_datetime(df['Date'])
We also want more columns. It might be helpful to have the difference between the high and low to see the total price range within one day, and the difference between open and close to see the net change over the trading day. When Close-Open is negative, the price of the stock decreased that day.
df['High-Low'] = df['High'] - df['Low']
df['Close-Open'] = df['Close'] - df['Open']
We also want a more standardized measure of the change between rows, so let's calculate the percent change between consecutive rows for all of the price-related columns.
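# Note: rows are ordered newest-first (index 0 is the most recent date), so
# pct_change() compares each row to the row above it, i.e. to the following
# trading day rather than the previous one.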
df['pct_change_high'] = df['High'].pct_change()
df['pct_change_low'] = df['Low'].pct_change()
df['pct_change_close'] = df['Close'].pct_change()
df['pct_change_open'] = df['Open'].pct_change()
df
Date | Open | High | Low | Close | Volume | High-Low | Close-Open | pct_change_high | pct_change_low | pct_change_close | pct_change_open | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2024-10-01 | 573.40 | 574.06 | 566.00 | 568.62 | 72668781 | 8.06 | -4.78 | NaN | NaN | NaN | NaN |
1 | 2024-09-30 | 570.42 | 574.38 | 568.08 | 573.76 | 63655449 | 6.30 | 3.34 | 0.000557 | 0.003675 | 0.009039 | -0.005197 |
2 | 2024-09-27 | 573.39 | 574.22 | 570.42 | 571.47 | 42100930 | 3.80 | -1.92 | -0.000279 | 0.004119 | -0.003991 | 0.005207 |
3 | 2024-09-26 | 574.38 | 574.71 | 569.90 | 572.30 | 48336000 | 4.81 | -2.08 | 0.000853 | -0.000912 | 0.001452 | 0.001727 |
4 | 2024-09-25 | 571.14 | 571.89 | 568.91 | 570.04 | 38428594 | 2.98 | -1.10 | -0.004907 | -0.001737 | -0.003949 | -0.005641 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
748 | 2021-10-08 | 439.48 | 439.89 | 437.19 | 437.86 | 74557398 | 2.70 | -1.62 | -0.000840 | 0.005913 | 0.007293 | 0.005307 |
749 | 2021-10-07 | 438.39 | 441.68 | 438.20 | 438.66 | 72437508 | 3.48 | 0.27 | 0.004069 | 0.002310 | 0.001827 | -0.002480 |
750 | 2021-10-06 | 429.27 | 435.12 | 427.54 | 434.90 | 113032203 | 7.58 | 5.63 | -0.014852 | -0.024327 | -0.008572 | -0.020803 |
751 | 2021-10-05 | 430.24 | 435.49 | 429.39 | 433.10 | 90682523 | 6.10 | 2.86 | 0.000850 | 0.004327 | -0.004139 | 0.002260 |
752 | 2021-10-04 | 433.00 | 433.96 | 426.36 | 428.64 | 128570000 | 7.60 | -4.36 | -0.003513 | -0.007057 | -0.010298 | 0.006415 |
753 rows × 12 columns
To get an idea of what our dataset’s prices look like over time as a whole, let’s first graph our four price measurements: Open, Close, High, and Low. Our data was measured from October 2021 to October 2024. We can see that all four prices roughly decrease from 2022 to the start of 2023, then increase over the course of 2023 and 2024.
# Line Plot of Open, High, Low, and Close Prices
plt.figure(figsize=(20, 10))
plt.plot(df["Date"], df["Open"], label='Open', color='blue')
plt.plot(df["Date"], df["High"], label='High', color='green')
plt.plot(df["Date"], df["Low"], label='Low', color='red')
plt.plot(df["Date"], df["Close"], label='Close', color='purple')
plt.title("Stock Prices Over Time")
plt.xlabel("Date")
plt.ylabel("Price ($)")
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
Now, let’s take a look specifically at the daily prices at opening, represented by our ‘Open’ data. The data represents the value of a share of the SPY ETF at the time that the stock market opens every day. This provides a daily starting measure of the stock, and when combined with our other data, we can see how the price changes over the course of the day.
# Line Plot of Opening Prices
plt.figure(figsize=(20, 10))
plt.plot(df["Date"], df["Open"], label='Open', color='blue')
plt.title("Opening Prices Over Time")
plt.xlabel("Date")
plt.ylabel("Price ($)")
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
The ‘High’ data represents the highest value that a share of the SPY ETF reaches in a given day. The price of a share can fluctuate greatly throughout a single day, so understanding the maximum value SPY attains in a day can help us understand the overall performance of the S&P 500 that day.
# Line Plot of High (Daily Maximum) Prices
plt.figure(figsize=(20, 10))
plt.plot(df["Date"], df["High"], label='High', color='green')
plt.title("High (Daily Maximum) Prices Over Time")
plt.xlabel("Date")
plt.ylabel("Price ($)")
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
Similarly, the ‘Low’ data represents the lowest value reached for each share of the SPY ETF in a given day. The difference between the High and Low values gives us the total fluctuation in the value of a share within a day.
# Line Plot of Low (Daily Minimum) Prices
plt.figure(figsize=(20, 10))
plt.plot(df["Date"], df["Low"], label='Low', color='red')
plt.title("Low (Daily Minimum) Prices Over Time")
plt.xlabel("Date")
plt.ylabel("Price ($)")
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
For our last price measurement, the ‘Close’ data gives us the value of a SPY ETF share at the time that the markets close each day. The change between Close and Open shows us how much the value of a share changed between the beginning and ending of the day.
# Line Plot of Closing Prices
plt.figure(figsize=(20, 10))
plt.plot(df["Date"], df["Close"], label='Close', color='purple')
plt.title("Closing Prices Over Time")
plt.xlabel("Date")
plt.ylabel("Price ($)")
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
Another important measurement that provides insights into the performance of SPY is the volume. The volume represents the total number of shares traded in a given day, encompassing both the purchasing and selling of shares. The measurement gives us an idea of how much activity is occurring in a day, which can fluctuate greatly based on global, political, or financial events. The data appears to fluctuate over time without any clear trajectory; however, individual changes in volume are usually due to specific factors that make people more likely to trade on a given day. For example, if major companies in the S&P 500 are experiencing severe financial troubles, investors may be more likely to sell their shares of SPY. It is important to note, however, that since SPY represents an investment in a large basket of companies, volume tends to be less volatile than for the stocks of individual companies.
# Line Plot of Volume
plt.figure(figsize=(20, 10))
plt.plot(df["Date"], df["Volume"], label='Volume', color='brown')
plt.title("Volume Over Time")
plt.xlabel("Date")
plt.ylabel("Volume")
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
Before diving into the in-depth analysis and application of machine learning to our dataset, it is important to take a look at the basic features of our data. First, we want to look at the number of features, the number of null and non-null entries, and the types of each piece of data. We also quickly look at the head and tail of our dataset to get an idea about how a typical entry will look. Next, we observe summary statistics of our dataset. We see that the mean for each price-related feature (High, Low, Close, Open) hovers around 450 dollars per share. The minimum to maximum range is about 350 to 575 dollars per share over the course of our measurement period. We can see, once again, that volume varies wildly over time and has vastly different minimum and maximum values. This exploratory data analysis gives us a good idea of what to expect when analyzing the features of our dataset in more detail later on.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753 entries, 0 to 752
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Date              753 non-null    datetime64[ns]
 1   Open              753 non-null    float64
 2   High              753 non-null    float64
 3   Low               753 non-null    float64
 4   Close             753 non-null    float64
 5   Volume            753 non-null    int64
 6   High-Low          753 non-null    float64
 7   Close-Open        753 non-null    float64
 8   pct_change_high   752 non-null    float64
 9   pct_change_low    752 non-null    float64
 10  pct_change_close  752 non-null    float64
 11  pct_change_open   752 non-null    float64
dtypes: datetime64[ns](1), float64(10), int64(1)
memory usage: 70.7 KB
df.head()
Date | Open | High | Low | Close | Volume | High-Low | Close-Open | pct_change_high | pct_change_low | pct_change_close | pct_change_open | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2024-10-01 | 573.40 | 574.06 | 566.00 | 568.62 | 72668781 | 8.06 | -4.78 | NaN | NaN | NaN | NaN |
1 | 2024-09-30 | 570.42 | 574.38 | 568.08 | 573.76 | 63655449 | 6.30 | 3.34 | 0.000557 | 0.003675 | 0.009039 | -0.005197 |
2 | 2024-09-27 | 573.39 | 574.22 | 570.42 | 571.47 | 42100930 | 3.80 | -1.92 | -0.000279 | 0.004119 | -0.003991 | 0.005207 |
3 | 2024-09-26 | 574.38 | 574.71 | 569.90 | 572.30 | 48336000 | 4.81 | -2.08 | 0.000853 | -0.000912 | 0.001452 | 0.001727 |
4 | 2024-09-25 | 571.14 | 571.89 | 568.91 | 570.04 | 38428594 | 2.98 | -1.10 | -0.004907 | -0.001737 | -0.003949 | -0.005641 |
df.tail()
Date | Open | High | Low | Close | Volume | High-Low | Close-Open | pct_change_high | pct_change_low | pct_change_close | pct_change_open | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
748 | 2021-10-08 | 439.48 | 439.89 | 437.19 | 437.86 | 74557398 | 2.70 | -1.62 | -0.000840 | 0.005913 | 0.007293 | 0.005307 |
749 | 2021-10-07 | 438.39 | 441.68 | 438.20 | 438.66 | 72437508 | 3.48 | 0.27 | 0.004069 | 0.002310 | 0.001827 | -0.002480 |
750 | 2021-10-06 | 429.27 | 435.12 | 427.54 | 434.90 | 113032203 | 7.58 | 5.63 | -0.014852 | -0.024327 | -0.008572 | -0.020803 |
751 | 2021-10-05 | 430.24 | 435.49 | 429.39 | 433.10 | 90682523 | 6.10 | 2.86 | 0.000850 | 0.004327 | -0.004139 | 0.002260 |
752 | 2021-10-04 | 433.00 | 433.96 | 426.36 | 428.64 | 128570000 | 7.60 | -4.36 | -0.003513 | -0.007057 | -0.010298 | 0.006415 |
df.describe()
Date | Open | High | Low | Close | Volume | High-Low | Close-Open | pct_change_high | pct_change_low | pct_change_close | pct_change_open | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 753 | 753.000000 | 753.000000 | 753.000000 | 753.000000 | 7.530000e+02 | 753.000000 | 753.000000 | 752.000000 | 752.000000 | 752.000000 | 752.000000 |
mean | 2023-04-02 12:18:10.039841024 | 448.402537 | 451.117703 | 445.519097 | 448.498539 | 8.037530e+07 | 5.598606 | 0.096003 | -0.000328 | -0.000326 | -0.000314 | -0.000311 |
min | 2021-10-04 00:00:00 | 349.210000 | 359.820000 | 348.110000 | 356.560000 | 2.731413e+07 | 0.770000 | -14.080000 | -0.035186 | -0.040247 | -0.052092 | -0.052476 |
25% | 2022-07-05 00:00:00 | 407.580000 | 410.490000 | 405.020000 | 408.020000 | 6.174603e+07 | 3.420000 | -2.020000 | -0.005544 | -0.006044 | -0.006973 | -0.007001 |
50% | 2023-04-03 00:00:00 | 439.480000 | 442.970000 | 437.220000 | 439.640000 | 7.575710e+07 | 4.860000 | 0.290000 | -0.001158 | -0.000766 | -0.000542 | -0.000988 |
75% | 2024-01-02 00:00:00 | 475.440000 | 477.060000 | 473.300000 | 476.160000 | 9.355980e+07 | 6.990000 | 2.470000 | 0.004494 | 0.005142 | 0.005660 | 0.005119 |
max | 2024-10-01 00:00:00 | 574.380000 | 574.710000 | 570.420000 | 573.760000 | 2.524967e+08 | 19.620000 | 17.280000 | 0.040325 | 0.044066 | 0.045459 | 0.052650 |
std | NaN | 53.154062 | 52.746088 | 53.388004 | 53.092185 | 2.875236e+07 | 2.946807 | 3.900274 | 0.009389 | 0.010100 | 0.011128 | 0.011192 |
df.count()
Column | Count |
---|---|
Date | 753 |
Open | 753 |
High | 753 |
Low | 753 |
Close | 753 |
Volume | 753 |
High-Low | 753 |
Close-Open | 753 |
pct_change_high | 752 |
pct_change_low | 752 |
pct_change_close | 752 |
pct_change_open | 752 |
df.dtypes
Column | Dtype |
---|---|
Date | datetime64[ns] |
Open | float64 |
High | float64 |
Low | float64 |
Close | float64 |
Volume | int64 |
High-Low | float64 |
Close-Open | float64 |
pct_change_high | float64 |
pct_change_low | float64 |
pct_change_close | float64 |
pct_change_open | float64 |
Now that we have processed our data and analyzed some basic summary statistics, we want to understand differences, similarities, and relationships between our variables. We can discover these qualities through hypothesis testing. We will start by performing t-test and ANOVA analyses.
The t-test measures the significance of the difference between the means of two samples. Here, we use a paired t-test on the open and close prices to determine how stable the price of a share of the SPY ETF is over the course of a day. Our null hypothesis is that the means of the two measures are not significantly different, and our alternative hypothesis is that the means are significantly different.
Hypothesis Test 1: Relationship between mean of open prices and mean of close prices
H_0: The mean of the open prices is equal to the mean of the close prices.
H_1: The mean of the open prices is not equal to the mean of the close prices.
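Before running it, here is a minimal sketch of the statistic that stats.ttest_rel computes: the mean of the per-day differences divided by its standard error.
# Paired t-statistic: t = mean(d) / (std(d) / sqrt(n)), where d = Open - Close
# for each day and n is the number of days.
d = df['Open'] - df['Close']
t_manual = d.mean() / (d.std(ddof=1) / np.sqrt(len(d)))
print(t_manual)  # should match the T-statistic reported below (~ -0.675)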
t_stat, p_value = stats.ttest_rel(df['Open'], df['Close'])
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
# Create a box plot for Open and Close prices
plt.figure(figsize=(8, 6))
sns.boxplot(data=df[['Open', 'Close']], palette="pastel")
plt.title("Comparison of Open and Close Prices")
plt.ylabel("Price")
plt.show()
T-statistic: -0.675
P-value: 0.500
Hypothesis 1 Conclusion
Since the p-value is 0.500, there is no statistically significant difference between the open and close prices. Therefore, there is not enough evidence to reject the null hypothesis. This implies that, on average, the stock's open and close prices are quite similar, suggesting relative stability in the stock's price during each trading day.
The ANOVA test compares the means of three or more groups. We use ANOVA here to analyze whether the year has a significant impact on trading volume. This gives us insight into whether the mean volume changes significantly from year to year, which could mean that global or financial events within a given year affected the trading volume. Our null hypothesis is that volume is not dependent on the year, and our alternative hypothesis is that the year does impact the volume.
Hypothesis Test 2A: Relationship Between Year and Volume Mean
H_0: The year does not affect the mean of the volume.
H_a: The year does affect the mean of the volume.
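For reference, here is a minimal sketch of the one-way ANOVA F-statistic that stats.f_oneway computes: the between-group mean square divided by the within-group mean square.
# F = (SS_between / (k - 1)) / (SS_within / (n - k)), where k is the number
# of groups and n the total number of observations across all groups.
def f_statistic(groups):
    arrays = [np.asarray(g, dtype=float) for g in groups]
    all_vals = np.concatenate(arrays)
    grand_mean = all_vals.mean()
    k, n = len(arrays), len(all_vals)
    ss_between = sum(len(g) * (g.mean() - grand_mean) ** 2 for g in arrays)
    ss_within = sum(((g - g.mean()) ** 2).sum() for g in arrays)
    return (ss_between / (k - 1)) / (ss_within / (n - k))
# Applying f_statistic to the per-year Volume groups defined below should
# match the F-statistic from stats.f_oneway.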
# ANOVA Testing
df_2021 = df[df['Date'].dt.year == 2021]
df_2022 = df[df['Date'].dt.year == 2022]
df_2023 = df[df['Date'].dt.year == 2023]
df_2024 = df[df['Date'].dt.year == 2024]
f_stat, p_val = stats.f_oneway(df_2021['Volume'], df_2022['Volume'], df_2023['Volume'], df_2024['Volume'])
print(f"F-statistic: {f_stat:.3f}")
print(f"P-value: {p_val:3f}")
# Creating a bar chart of the volume means based on year
df['Year'] = df['Date'].dt.year
volume_means = df.groupby('Year')['Volume'].mean()
volume_means.plot(kind = 'bar', figsize = (10, 5), color = ['forestgreen', 'purple',
'lightblue', 'lightcoral'])
plt.title('Mean Volume by Year')
plt.xlabel('Year')
plt.ylabel('Mean Volume (In 10s of Millions)')
plt.show()
df = df.drop(columns = ['Year'])
F-statistic: 61.896
P-value: 0.000000
Hypothesis Test 2A Conclusion
Conducting the ANOVA test resulted in a printed p-value of 0.000000, meaning that the p-value is so small that it rounds to zero at the displayed precision. Since the p-value is far below the significance level of 0.05, the probability of observing these results under the null hypothesis is extremely low. Therefore, we can safely reject the null hypothesis and conclude that the year does affect the mean volume. This conveys that the mean number of shares bought and sold each day does have a relationship with the year.
Hypothesis Test 2B: Relationship Between Close Prices and Volume Means
The ANOVA test compares the means of three or more groups. We use ANOVA here to analyze whether the close price has a significant impact on trading volume. This gives us insight into whether the mean volume changes significantly across different price levels, which could help explain the trading habits of individuals. Our null hypothesis is that volume is not dependent on the close price, and our alternative hypothesis is that the close price does impact the volume.
H_0: The close price does not affect the mean of the volume.
H_a: The close price does affect the mean of the volume.
# ANOVA Testing
price_bins = pd.qcut(df['Close'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])
df['Close_Category'] = price_bins
volume_means_by_price = df.groupby('Close_Category', observed=False)['Volume'].mean()  # observed=False silences a pandas FutureWarning
groups = [df[df['Close_Category'] == category]['Volume'] for category in df['Close_Category'].unique()]
f_stat, p_val = stats.f_oneway(*groups)
print(f"F-statistic: {f_stat:.3f}")
print(f"P-value: {p_val:.6f}")
# Creating a bar chart of the volume means based on low, medium, high, and very high close prices
volume_means_by_price.plot(kind='bar', figsize=(10, 5), color=['skyblue', 'orange', 'green', 'red'])
plt.title('Mean Volume by Close Price Category')
plt.xlabel('Close Price Category')
plt.ylabel('Mean Volume (In 10s of Millions)')
plt.xticks(rotation=0)
plt.show()
df = df.drop(columns=['Close_Category'])
F-statistic: 55.397
P-value: 0.000000
Hypothesis Test 2B Conclusion
Conducting the ANOVA test resulted in a printed p-value of 0.000000, meaning that the p-value is so small that it rounds to zero at the displayed precision. Since the p-value is far below the significance level of 0.05, the probability of observing these results under the null hypothesis is extremely low. Therefore, we can safely reject the null hypothesis and conclude that the close price does affect the mean volume. This conveys that the mean number of shares bought and sold each day does have a relationship with the close price.
It is important for us to know whether any data points in our dataset are irregular, since such points could affect the analysis we do and the conclusions we make. For this reason, we need to check whether there are outliers in our data. Here, we perform outlier detection using the z-score formula, which standardizes each value by the column's mean and standard deviation.
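A minimal sketch of the computation behind stats.zscore (note that scipy standardizes with the population standard deviation, ddof=0):
# z = (x - mean) / std; values with |z| > 3 are flagged as potential outliers
col = df['Open']
z_manual = (col - col.mean()) / col.std(ddof=0)
print(df[np.abs(z_manual) > 3])  # should match the zscore filter below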
df[(np.abs(stats.zscore(df['Open'])) > 3)]
(Empty DataFrame: no rows returned)
There are no outliers in the Open column.
df[(np.abs(stats.zscore(df['Close'])) > 3)]
(Empty DataFrame: no rows returned)
There are no outliers in the Close column.
df[(np.abs(stats.zscore(df['High'])) > 3)]
(Empty DataFrame: no rows returned)
There are no outliers in the High column.
df[(np.abs(stats.zscore(df['Low'])) > 3)]
(Empty DataFrame: no rows returned)
There are no outliers in the Low column.
df[(np.abs(stats.zscore(df['Volume'])) > 3)]
Date | Open | High | Low | Close | Volume | High-Low | Close-Open | pct_change_high | pct_change_low | pct_change_close | pct_change_open | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
389 | 2023-03-15 | 385.89 | 389.49 | 383.71 | 389.28 | 172996891 | 5.78 | 3.39 | -0.017605 | -0.006679 | -0.017243 | -0.002404 |
392 | 2023-03-10 | 390.99 | 393.16 | 384.32 | 385.91 | 189252984 | 8.84 | -5.08 | 0.007095 | 0.009641 | 0.001427 | 0.024043 |
578 | 2022-06-13 | 379.85 | 381.81 | 373.30 | 375.00 | 170004891 | 8.51 | -4.85 | 0.010240 | 0.007313 | 0.003022 | 0.007961 |
604 | 2022-05-05 | 424.55 | 425.00 | 409.44 | 413.81 | 172929109 | 15.56 | -10.74 | 0.024590 | 0.009144 | 0.006005 | 0.032717 |
653 | 2022-02-24 | 411.02 | 428.76 | 410.64 | 428.30 | 213942891 | 18.12 | 17.28 | -0.020738 | -0.040247 | -0.021588 | -0.043272 |
673 | 2022-01-26 | 440.72 | 444.04 | 428.86 | 433.38 | 186391109 | 15.18 | -7.34 | 0.005548 | -0.001374 | 0.004962 | 0.005613 |
674 | 2022-01-25 | 433.06 | 439.72 | 427.15 | 434.47 | 167997297 | 12.57 | 1.41 | -0.009729 | -0.003987 | 0.002515 | -0.017381 |
675 | 2022-01-24 | 432.03 | 440.38 | 420.76 | 439.84 | 252496703 | 19.62 | 7.81 | 0.001501 | -0.014960 | 0.012360 | -0.002378 |
676 | 2022-01-21 | 445.56 | 448.06 | 437.95 | 437.98 | 202271203 | 10.11 | -7.58 | 0.017439 | 0.040855 | -0.004229 | 0.031317 |
Outlier Detection Visual
Below we created a box plot for each of the stock price columns (Open, High, Low, Close) to identify any outliers.
plt.figure(figsize=(10, 6))
sns.boxplot(data=df[['Open', 'High', 'Low', 'Close']], palette="Set2")
plt.title("Box Plot of Open, High, Low, and Close Prices (Outlier Detection)")
plt.ylabel("Price ($)")
plt.show()
Outlier Detection Visual
This is a boxplot we made to visually represent the outliers in the volume column as well. From the results below, you can see that all of the outliers lie past the upper whisker of the boxplot.
plt.figure(figsize=(10, 6))
sns.boxplot(df['Volume'], color="skyblue")
plt.title("Box Plot of Volume (Outlier Detection)")
plt.xlabel("Volume (In 10s of Millions)")
plt.show()
Outlier Detection Conclusion
There are no outliers in the Open, Close, High, and Low columns. But we did find some outliers in the Volume column.
In conclusion, using both the box plot (IQR method) and Z-score methods for outlier detection provided valuable insights into the dataset. The box plot revealed several high-volume outliers, indicating trading days with significantly larger activity compared to the rest of the dataset. The Z-score method, which identifies data points beyond 3 standard deviations from the mean, highlighted a few key outliers, but fewer than the box plot due to its reliance on the assumption of normal distribution. Together, these methods suggest that while most stock prices remained stable, trading volume experienced occasional spikes, with notable outliers. This comprehensive approach allowed for a deeper understanding of both subtle and extreme deviations in the data.
We can also analyze whether any variables are related to each other. A good way to do this is with the Pearson correlation coefficient, a statistical measure that indicates the strength and direction of a linear relationship between two variables, ranging from -1 (perfect negative correlation) to +1 (perfect positive correlation), with 0 signifying no correlation. Essentially, it tells you how closely two variables tend to move together in a linear fashion. First, we will compute a correlation matrix, and then create a visual representation of it to better understand the relationships in the data.
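As a quick illustration of this definition, the sketch below computes Pearson's r for Close versus Volume directly from the formula r = cov(x, y) / (std(x) * std(y)); it should match the corresponding entry in the matrix that follows.
# Pearson r from its definition: covariance over the product of the
# standard deviations (the ddof terms cancel in the ratio).
x, y = df['Close'], df['Volume']
r_manual = ((x - x.mean()) * (y - y.mean())).sum() / np.sqrt(
    ((x - x.mean()) ** 2).sum() * ((y - y.mean()) ** 2).sum())
print(r_manual)  # ~ -0.446, the Close/Volume entry of the matrix below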
correlation_matrix = df.corr()
correlation_matrix
Date | Open | High | Low | Close | Volume | High-Low | Close-Open | pct_change_high | pct_change_low | pct_change_close | pct_change_open | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | 1.000000 | 0.644639 | 0.642145 | 0.649665 | 0.645861 | -0.400067 | -0.276108 | 0.006401 | -0.054895 | -0.049748 | -0.044890 | -0.048574 |
Open | 0.644639 | 1.000000 | 0.999106 | 0.998754 | 0.997305 | -0.437498 | -0.211269 | -0.052544 | 0.006585 | 0.015253 | 0.011819 | 0.009967 |
High | 0.642145 | 0.999106 | 1.000000 | 0.998531 | 0.998598 | -0.427249 | -0.191226 | -0.022770 | -0.001611 | -0.001455 | 0.011257 | -0.015460 |
Low | 0.649665 | 0.998754 | 0.998531 | 1.000000 | 0.998842 | -0.458294 | -0.244123 | -0.014649 | -0.015819 | 0.005036 | 0.010660 | -0.021424 |
Close | 0.645861 | 0.997305 | 0.998598 | 0.998842 | 1.000000 | -0.446101 | -0.221959 | 0.020857 | -0.030670 | -0.019177 | 0.012274 | -0.049353 |
Volume | -0.400067 | -0.437498 | -0.427249 | -0.458294 | -0.446101 | 1.000000 | 0.655519 | -0.110165 | 0.172979 | -0.021429 | 0.010631 | 0.116429 |
High-Low | -0.276108 | -0.211269 | -0.191226 | -0.244123 | -0.221959 | 0.655519 | 1.000000 | -0.142173 | 0.257007 | -0.116944 | 0.008281 | 0.111154 |
Close-Open | 0.006401 | -0.052544 | -0.022770 | -0.014649 | 0.020857 | -0.110165 | -0.142173 | 1.000000 | -0.506001 | -0.467746 | 0.006040 | -0.805697 |
pct_change_high | -0.054895 | 0.006585 | -0.001611 | -0.015819 | -0.030670 | 0.172979 | 0.257007 | -0.506001 | 1.000000 | 0.736582 | 0.672693 | 0.766984 |
pct_change_low | -0.049748 | 0.015253 | -0.001455 | 0.005036 | -0.019177 | -0.021429 | -0.116944 | -0.467746 | 0.736582 | 1.000000 | 0.711374 | 0.731918 |
pct_change_close | -0.044890 | 0.011819 | 0.011257 | 0.010660 | 0.012274 | 0.010631 | 0.008281 | 0.006040 | 0.672693 | 0.711374 | 1.000000 | 0.327441 |
pct_change_open | -0.048574 | 0.009967 | -0.015460 | -0.021424 | -0.049353 | 0.116429 | 0.111154 | -0.805697 | 0.766984 | 0.731918 | 0.327441 | 1.000000 |
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, square=True, linewidths=0.5)
plt.title("Correlation Matrix of Stock Prices")
plt.show()
The Pearson correlation coefficient measures the linear relationship between two variables: 1 indicates a perfect positive linear relationship (perfectly correlated), -1 indicates a perfect negative linear relationship (perfectly anti-correlated), and 0 indicates no linear relationship.
From this we can understand several relationships.
Open and Date (0.64) are positively correlated.
High and Date (0.64) are positively correlated.
Low and Date (0.65) are positively correlated.
Close and Date (0.65) are positively correlated.
Date and Volume (-0.4) are negatively correlated.
Open and Volume (-0.44) are negatively correlated.
High and Volume (-0.43) are negatively correlated.
Low and Volume (-0.46) are negatively correlated.
Close and Volume (-0.45) are negatively correlated.
We chose decision tree regression over a deep neural network (DNN) because, while the DNN was able to output a result, it lacked interpretability, making it difficult to understand or visualize how it arrived at its predictions. Decision trees, on the other hand, offer a clear and intuitive structure that makes it easy to visualize and explain the decision-making process. This transparency allows us to identify and rank the importance of features in our dataset, enabling a deeper understanding of the underlying relationships and insights that drive the model's predictions. The decision tree also achieved higher overall prediction accuracy.
We chose not to split the training and testing data randomly because we are working with time series data. By splitting the data sequentially rather than randomly, we ensure that the model learns from past patterns to predict future events, preserving the temporal dependencies and trends within the data. Random splitting can lead to data leakage, where information from the future accidentally influences predictions about the past, leading to poor generalization. Maintaining the chronological order allows the model to capture and learn time-dependent patterns such as trends, seasonality, and cyclical behavior effectively.
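For reference, scikit-learn offers a built-in utility for this kind of ordered splitting. Below is a minimal sketch using TimeSeriesSplit; it is not part of our pipeline, just an illustration, and it assumes the rows have first been sorted in chronological order.
from sklearn.model_selection import TimeSeriesSplit

# Each fold trains only on rows that precede the test rows, so no
# future information leaks into training.
tscv = TimeSeriesSplit(n_splits=5)
for train_idx, test_idx in tscv.split(df):
    print(f"train: rows 0-{train_idx[-1]}, test: rows {test_idx[0]}-{test_idx[-1]}")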
percent_split = 0.8
target_column = 'Next-Close-Open'
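# Rows are ordered newest-first, so shift(1) copies each row's Close-Open from
# the row above, i.e. from the next trading day; the target is therefore the
# following day's Close-Open.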
df[target_column] = df['Close-Open'].shift(1)
train_size = int(len(df) * percent_split)
new_df = df.drop(columns=["Date"]).dropna()
train_df = new_df.iloc[:train_size]
test_df = new_df.iloc[train_size:]
X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]
X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]
clf = tree.DecisionTreeRegressor()
clf = clf.fit(X_train, y_train)
test_predictions = clf.predict(X_test)
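To illustrate the interpretability advantage mentioned above, we can rank how much each feature contributed to the fitted tree's splits; a minimal sketch using scikit-learn's feature_importances_ attribute:
# Feature importances sum to 1; higher values mean a feature did more to
# reduce error across the tree's splits.
importances = pd.Series(clf.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False))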
Below are some calculations to understand just how well the model is doing. The Mean Squared Error (MSE) quantifies the overall model error. The Root Mean Squared Error (RMSE) presents this error in the same units as the target variable. The Mean Absolute Error (MAE) offers a linear score that treats all errors equally and is less sensitive to outliers compared to MSE and RMSE. The R-squared (R2) value represents the proportion of variance in the dependent variable explained by the model, allowing for easy comparison between different models on the same dataset.
mse = mean_squared_error(y_test, test_predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, test_predictions)
r2 = r2_score(y_test, test_predictions)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")
print(f"Mean of y_test: {np.mean(y_test)}")
print(f"Standard deviation of y_test: {np.std(y_test)}")
print(f"Mean of test_predictions: {np.mean(test_predictions)}")
print(f"Standard deviation of test_predictions: {np.std(test_predictions)}")
Mean Squared Error (MSE): 9.622851333333337
Root Mean Squared Error (RMSE): 3.1020721031809266
Mean Absolute Error (MAE): 2.186599999999997
R-squared (R2): 0.6053888789800386
Mean of y_test: -0.16113333333333307
Standard deviation of y_test: 4.938183544133973
Mean of test_predictions: -0.45519999999999605
Standard deviation of test_predictions: 4.513693198848742
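As a sanity check, each of these metrics can also be computed directly from its definition; a minimal sketch that should reproduce the sklearn values above:
# MSE = mean((y - yhat)^2), RMSE = sqrt(MSE), MAE = mean(|y - yhat|),
# R^2 = 1 - SS_residual / SS_total
err = y_test.to_numpy() - test_predictions
mse_manual = np.mean(err ** 2)
rmse_manual = np.sqrt(mse_manual)
mae_manual = np.mean(np.abs(err))
r2_manual = 1 - (err ** 2).sum() / ((y_test - y_test.mean()) ** 2).sum()
print(mse_manual, rmse_manual, mae_manual, r2_manual)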
This visualization displays how the decision tree makes predictions of the close-open value based on attributes of the data. X represents the data being used to make the prediction, and the index indicates which column of the data is used in each split.
print("Columns:")
for idx, col in enumerate(X_train.columns):
print(f"{idx}: {col}")
plt.figure(figsize=(20, 10))
tree.plot_tree(clf, max_depth=4, fontsize = 8)
plt.show()
Columns:
0: Open
1: High
2: Low
3: Close
4: Volume
5: High-Low
6: Close-Open
7: pct_change_high
8: pct_change_low
9: pct_change_close
10: pct_change_open
This visualization plots both the actual and predicted close-open values: the actual values as a solid blue line and the predicted values as a red dashed line. The purpose of this visualization is to understand how well our model can generally predict the values of close-open.
y_test_array = np.asarray(y_test).astype('float32')
test_predictions_array = np.asarray(test_predictions).astype('float32')
plt.figure(figsize=(10, 6))
plt.plot(y_test_array, label='Actual Values', color='blue', linewidth=1.5)
plt.plot(test_predictions_array, label='Predicted Values', color='red', linestyle='dashed', linewidth=1.5)
plt.title('Actual vs Predicted Values (Line)')
plt.xlabel('Days')
plt.ylabel('Value [$]')
plt.legend()
plt.show()
This bar plot displays the difference between the actual close-open values and our predicted values. The plot helps visualize the error for each day and whether a higher or lower price was predicted.
difference = y_test_array - test_predictions_array.squeeze()
colors = ['green' if diff > 0 else 'red' for diff in difference]
plt.figure(figsize=(10, 6))
plt.bar(range(len(difference)), difference, color=colors, alpha=0.6)
plt.title('Difference Between Actual and Predicted Values')
plt.xlabel('Days')
plt.ylabel('Difference [$]')
plt.show()
The scatter plot shows how far the predicted values deviate from the actual values; a “perfect” prediction would fall on the dotted orange line.
# Scatter Plot: Predicted vs Actual
plt.figure(figsize=(8, 8))
plt.scatter(y_test_array, test_predictions_array, alpha=0.6, color='green')
plt.plot([y_test_array.min(), y_test_array.max()], [y_test_array.min(), y_test_array.max()], color='orange', linestyle='--', linewidth=1.5)
plt.title('Predicted vs Actual Values')
plt.xlabel('Actual Values [$]')
plt.ylabel('Predicted Values [$]')
plt.show()
After reading this report, an uninformed reader would have ample information to feel informed about this project. We define our problem at the outset and explicitly state our original source of information. By introducing the SPY ETF as a representation of the S&P 500 and outlining the key variables, such as daily opening, closing, high, and low prices, as well as trading volume, the reader gains an understanding of the dataset's importance. Through exploratory data analysis, we present in-depth visualizations such as line graphs for price trends and volume fluctuations, box plots for outlier detection, and a correlation heatmap to uncover relationships between variables. These visuals are supported by detailed explanations that help a reader interpret patterns in the data, such as the stability of prices over time and the significance of volume spikes during major financial events. This ensures that even a reader unfamiliar with the technical aspects of stock data can grasp the insights.
Additionally, hypothesis testing (t-test and ANOVA) is used in our analysis to evaluate the statistical significance of relationships within the dataset, such as the impact of year and close price on trading volume. These tests are introduced with clear null and alternative hypotheses, and the results are contextualized to highlight their practical implications, such as the stability of stock prices or the influence of global events on trading behavior. The report also delves into predictive modeling using decision tree regression, which is introduced as a tool for forecasting stock price changes. Thus, we provide clear explanations for each step of our analysis that a reader, even one unfamiliar with the topic, can follow. By explaining how these predictions can help inform smarter stock market decisions, we ensure the reader understands the practical application of our analysis.
If an informed reader were to read this document, they would definitely feel more informed about the SPY ETF and the U.S. market in general. While they may already be familiar with concepts like stock analysis and machine learning, the report provides detailed, context-specific information and applies these methods directly to SPY ETF data, offering deeper insights. The problem is clearly defined, and the dataset is well-documented, including its time frame, source, and key variables such as High-Low and Close-Open, which are derived for deeper analysis. Exploratory data analysis is presented through intuitive visualizations and statistical summaries, highlighting trends like price stability and volume fluctuations during major financial events, which even an informed reader would find valuable for their specificity and clarity. In addition to our in-depth analysis, we present the reader with hypothesis testing, correlation analysis, predictive modeling, and visualizations. These sections, along with visualizations comparing actual and predicted values, equip the informed reader with insights into the model's performance and its implications. The report ends by synthesizing these findings, bridging technical analysis and practical decision-making, ensuring that even readers well-versed in these topics gain a richer and more nuanced understanding of SPY ETF data and its applications.
In this project, we sought to analyze the SPY ETF as a representation of the S&P 500, aiming to predict stock price changes using machine learning methods. By exploring historical stock data from Yahoo Finance (10/4/2021 - 10/1/2024), we applied various statistical analyses, visualizations, and a decision tree regression model to uncover meaningful patterns and predict future price changes.
Our findings indicate several key insights about the behavior of the SPY ETF over time. Exploratory data analysis revealed that while prices exhibited periods of stability and growth, volume trends were influenced by significant financial and geopolitical events. Outlier detection highlighted occasional spikes in trading volume, which were expected for an investment representing a large basket of companies. Correlation analysis showed strong relationships between price variables but an inverse correlation between volume and prices, suggesting that higher trading volumes often occurred during price dips. Through hypothesis testing, we confirmed that annual trends and close price categories significantly affect trading volume, reinforcing the influence of time and market conditions on investor behavior. The decision tree regression model was instrumental in predicting the daily Close-Open difference, leveraging historical data to make informed forecasts. While not perfect, the model demonstrated that it could capture underlying patterns and offer valuable predictive insights, with visualizations highlighting the model's accuracy and areas for improvement.
Overall, our findings suggest that machine learning, combined with robust statistical and exploratory analyses, can provide valuable insights into stock market behavior. This approach not only enhances our understanding of SPY ETF dynamics but also equips investors with data-driven tools to make smarter decisions. Future work could involve refining the model further, exploring alternative machine learning methods, and incorporating additional external factors such as macroeconomic indicators to improve prediction accuracy and deepen the analysis.