Tip: Jake Vanderplas has a great tutorial on using pandas!

#collapse-hide
# Standard data-analysis stack: pandas/NumPy for data, seaborn/matplotlib for plots.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import permutations
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Apply seaborn's default plot styling globally.
sns.set()

Note: Generate random data to simulate stock prices.
np.random.seed(42)

def brownian_motion(mean, std, npts):
    """Simulate a 1-D random walk of ``npts`` steps offset by ``mean``.

    Draws ``npts`` i.i.d. Gaussian increments (zero mean, scale ``std``)
    from the global NumPy RNG and returns their running sum shifted so
    the path wanders around ``mean``.
    """
    increments = np.random.normal(scale=std, size=npts)
    return mean + increments.cumsum()
    
num_stocks = 10
num_timesteps = 1000

# Build random 3-letter ticker symbols, one per simulated stock.
letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
ticker_chars = np.random.choice(list(letters), size=(num_stocks, 3))
tickers = [''.join(chars) for chars in ticker_chars]

# One calendar day per timestep.
dates = pd.date_range('2020-11-21', periods=num_timesteps, freq='D')

# Each stock gets a random starting level and per-step volatility,
# then a brownian-motion price path (stocks as columns, dates as rows).
start_levels = np.random.randint(50, 200, num_stocks)
volatilities = np.random.randint(2, 5, num_stocks)
walks = [brownian_motion(m, s, num_timesteps) for m, s in zip(start_levels, volatilities)]
data = np.vstack(walks).T

df = pd.DataFrame(data, columns=tickers, index=dates)
# Drop any column whose path ever dipped to zero or below (prices must stay positive).
df = df[df > 0].dropna(axis=1)
df.head()
GTO KHU GZS XUD HXC VUB XLF LZV
1969-04-20 141.116771 97.403393 110.300592 104.750134 159.614557 106.114842 112.311045 101.060632
1969-04-21 136.712380 101.730706 110.860277 100.405943 161.325646 109.976118 108.934610 103.011746
1969-04-22 141.233292 98.615540 107.905621 99.246578 159.395447 109.840294 108.188974 103.533443
1969-04-23 142.725768 100.724268 108.421143 101.966275 160.113603 106.707104 108.881800 102.359047
1969-04-24 141.179876 104.067308 111.383620 98.116840 164.030552 109.869213 109.098647 105.953067

Tip: Use this to load CSVs from your own Google Drive (Colab only)
# Colab-only: mount Google Drive so files under "My Drive" are readable as local paths.
from google.colab import drive 
drive.mount('/content/gdrive')
!ls "gdrive/My Drive" # this line will look in the folder

df = pd.read_csv('gdrive/My Drive/data.csv') # put the full path to the file in google drive here if you have one
# Plot every simulated price series on one wide set of axes.
price_fig, price_ax = plt.subplots(1, figsize=(20, 8))
df.plot(ax=price_ax)
plt.show()
# Correlate day-over-day differences (returns), not raw price levels —
# raw levels of random walks correlate spuriously.
df.diff().corr()
GTO KHU GZS XUD HXC VUB XLF LZV
GTO 1.000000 -0.034842 -0.008919 0.057757 -0.004012 -0.023837 -0.017131 -0.012034
KHU -0.034842 1.000000 -0.004939 0.026932 -0.013135 0.036544 0.027497 -0.025655
GZS -0.008919 -0.004939 1.000000 0.032693 -0.032603 0.063454 0.020113 0.010456
XUD 0.057757 0.026932 0.032693 1.000000 -0.010530 0.005832 -0.030495 0.034310
HXC -0.004012 -0.013135 -0.032603 -0.010530 1.000000 -0.021562 0.007714 -0.021141
VUB -0.023837 0.036544 0.063454 0.005832 -0.021562 1.000000 0.004669 0.017130
XLF -0.017131 0.027497 0.020113 -0.030495 0.007714 0.004669 1.000000 -0.012195
LZV -0.012034 -0.025655 0.010456 0.034310 -0.021141 0.017130 -0.012195 1.000000
# Eigendecompose the correlation matrix of daily differences.
# A correlation matrix is symmetric, so use eigh: it guarantees real
# eigenvalues/eigenvectors and is numerically stabler than eig.
e_val, e_vect = np.linalg.eigh(df.diff().corr())

# BUG FIX: eigenvectors come back as the COLUMNS of e_vect, so sorting
# by eigenvalue must reorder columns (e_vect[:, order]) — indexing with
# e_vect[order] reorders ROWS and scrambles the eigenvectors.
order = np.argsort(e_val)[::-1]          # descending eigenvalue order
e_val = e_val[order]
e_vect = e_vect[:, order]

# Rows are tickers (the components of each eigenvector); columns are the
# eigenvectors, largest eigenvalue first (labels kept from the original).
evect_df = pd.DataFrame(e_vect, columns=df.columns, index=df.columns)
evect_df
GTO KHU GZS XUD HXC VUB XLF LZV
GTO -0.462612 0.286135 -0.307235 -0.000519 -0.609454 0.470913 0.000300 0.128569
KHU -0.386584 0.551908 0.043692 0.485364 0.097528 -0.506980 -0.066881 -0.193522
GZS -0.146585 -0.366611 -0.130274 0.009367 -0.124167 -0.071223 -0.827676 -0.348587
XUD 0.260056 0.568043 -0.141505 -0.627165 0.046178 -0.006299 -0.109068 -0.426931
HXC 0.223860 0.091539 -0.431629 -0.137911 -0.149098 -0.488327 -0.224082 0.652133
VUB -0.298582 0.233458 0.288640 -0.178202 0.531979 0.319270 -0.394092 0.448368
XLF 0.518531 0.270936 0.541743 0.282665 -0.413420 0.133974 -0.289216 0.108735
LZV -0.373213 -0.141987 0.550999 -0.490227 -0.354058 -0.397183 0.097036 0.064043
# Heatmap of the eigenvector matrix (rows: tickers; columns: eigenvectors).
fig, ax = plt.subplots(1, figsize=(12, 10))
# BUG FIX: the plotted values are eigenVECTOR components, not eigenvalues —
# the original title mislabeled the chart.
ax.set_title('Eigenvectors of Correlation of Running Difference', fontsize=16)
sns.heatmap(evect_df, ax=ax, annot=True, fmt=".2f", linewidths=.5)
fig.savefig('../images/eigen_correlation_heatmap.png')  # assumes ../images exists — TODO confirm
plt.show()