Building your Long-Term Portfolio using Unsupervised ML with Community Detection Analysis in Python & R

We finally get to the portfolio building, where we try to beat SPY's average returns year over year (YoY). The whole code base can be found here:

To build the portfolio based on the correlation of stock prices in the S&P 500, we will use network centrality as a measure of how well each stock is integrated with the others.

The diagram below illustrates the process.

import community  # python-louvain
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

# Compute the pairwise Pearson correlation matrix of stock prices
stocks_cross_corr, _, _ = calculate_corr(df_stock_prices, 1, len(df_stock_prices), 'pearson')
stocks_cross_corr = stocks_cross_corr[1]
cor_threshold = 0.7

# Build the correlation network and detect communities with the Louvain method
G = build_graph(stocks_cross_corr, cor_threshold)
partition = community.best_partition(G)
modularity = community.modularity(partition, G)
values = [partition.get(node) for node in G.nodes()]
plt.figure(figsize=(10, 10))
nx.draw_spring(G, cmap=plt.get_cmap('jet'), node_color=values, node_size=30, with_labels=False)
print(modularity)
print("Total number of communities =", len(set(partition.values())))

# Four standard centrality measures for each node (stock)
dict_betweenness_centrality = nx.betweenness_centrality(G)
dict_degree_centrality = nx.degree_centrality(G)
dict_closeness_centrality = nx.closeness_centrality(G)
dict_eigenvector_centrality = nx.eigenvector_centrality(G)
print("dict_degree_centrality: ", dict_degree_centrality)
print("dict_closeness_centrality: ", dict_closeness_centrality)
print("dict_eigenvector_centrality: ", dict_eigenvector_centrality)
print("dict_betweenness_centrality: ", dict_betweenness_centrality)

Once the four centralities are obtained, we build our own portfolio-selection score by combining them with equal weighting. We merge the four dictionaries into one data structure, then iterate over each stock and sum its four centralities into a single combined metric. Because the weighting is equal, the formula is a plain sum with no weights attached.
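
In other words, for each stock s the combined score is C_total(s) = C_betweenness(s) + C_degree(s) + C_closeness(s) + C_eigenvector(s). For reference, a weighted variant would scale each term before summing (the 0.25s below are placeholder weights, not values from this analysis; equal weights reduce this to a scaled version of the plain sum):

# Hypothetical weighted variant of the centrality score
weights = {'betweenness': 0.25, 'degree': 0.25, 'closeness': 0.25, 'eigenvector': 0.25}
C_weighted = {
    stock: weights['betweenness'] * dict_betweenness_centrality[stock]
         + weights['degree'] * dict_degree_centrality[stock]
         + weights['closeness'] * dict_closeness_centrality[stock]
         + weights['eigenvector'] * dict_eigenvector_centrality[stock]
    for stock in dict_betweenness_centrality
}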

# Portfolio formula: combine the four centralities per stock
c_dict = dict([(k, [dict_betweenness_centrality[k], dict_eigenvector_centrality[k], dict_degree_centrality[k], dict_closeness_centrality[k]]) for k in dict_betweenness_centrality])
#print(c_dict)

C_total = {}
for key in c_dict:
    C_total[key] = sum(c_dict[key])


print("The Centrality total for stocks are:", C_total)

newDict = dict(filter(lambda elem: elem[1] > 0, C_total.items()))
print("Stocks with total centrality greater than 0:", newDict)
print(len(newDict))

df_centrality = pd.DataFrame(list(newDict.items()), columns=['Symbol', 'Centrality'])
df_centrality = df_centrality.sort_values(by='Centrality', ascending=False)
#df_centrality.head(20)
#type(df_centrality['Centrality'])
df_centrality.to_csv('centrality_of_stocks_0.7cor.csv', index=False)
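
The R half below reads back three such CSVs, one per correlation threshold. A minimal sketch of how the same export could be repeated for each threshold, assuming the helpers above (calculate_corr, build_graph) are already in scope:

import networkx as nx
import pandas as pd

def export_centrality(cross_corr, threshold):
    # Rebuild the graph at this correlation threshold, total the four
    # centralities per stock, and write the CSV the R code reads back in.
    G = build_graph(cross_corr, threshold)
    cents = [nx.betweenness_centrality(G), nx.degree_centrality(G),
             nx.closeness_centrality(G), nx.eigenvector_centrality(G)]
    totals = {node: sum(c[node] for c in cents) for node in G.nodes()}
    df = pd.DataFrame(list(totals.items()), columns=['Symbol', 'Centrality'])
    df.to_csv(f'centrality_of_stocks_{threshold}cor.csv', index=False)

for t in (0.5, 0.6, 0.7):
    export_centrality(stocks_cross_corr, t)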
# R environment from the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats

library(tidyverse)
library(tidyquant)
library(jsonlite)
library(readr)
library(igraph)
library(dplyr)
library(lubridate)
library(data.table)
library(Quandl)

# List the input data files available in the read-only "../input/" directory
list.files(path = "../input")

# Data load
stockprices <- read.csv("../input/usstockprices/stocks_price_final.csv")
centrality0.5 <- read.csv("../input/centrality-of-stocks-network-analysis/centrality_of_stocks_0.5cor.csv")
centrality0.6 <- read.csv("../input/centrality-of-stocks-network-analysis/centrality_of_stocks_0.6cor.csv")
centrality0.7 <- read.csv("../input/centrality-of-stocks-network-analysis/centrality_of_stocks_0.7cor.csv")
# Merge centrality values with other potentially useful information like market cap and sector for further filtering
stocks <- stockprices[!duplicated(stockprices$symbol), ] # removes duplicate symbols
Index0.5cor <- merge(stocks, centrality0.5, by.x = "symbol", by.y = "Symbol")
Index0.6cor <- merge(stocks, centrality0.6, by.x = "symbol", by.y = "Symbol")
Index0.7cor <- merge(stocks, centrality0.7, by.x = "symbol", by.y = "Symbol")
#head(Index0.6cor)
IndexGet0.5 <- as.character(Index0.5cor$symbol)
IndexGet0.6 <- as.character(Index0.6cor$symbol)
IndexGet0.7 <- as.character(Index0.7cor$symbol)
#Portfolio Testing

##0.5 correlation centrality prices
stockindex0.5 <- tq_get(IndexGet0.5, get = "stock.prices", from = "2015-07-01",
                        warnings = FALSE, stringsAsFactors = FALSE) %>%
  group_by(symbol) %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "monthly",
               col_rename = "monthly_return")

stockindex0.5

##0.6 correlation centrality prices
stockindex0.6 <- tq_get(IndexGet0.6, get = "stock.prices", from = "2015-07-01",
                        warnings = FALSE, stringsAsFactors = FALSE) %>%
  group_by(symbol) %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "monthly",
               col_rename = "monthly_return")


##0.7 correlation centrality prices
stockindex0.7 <- tq_get(IndexGet0.7, get = "stock.prices", from = "2015-07-01",
                        warnings = FALSE, stringsAsFactors = FALSE) %>%
  group_by(symbol) %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "monthly",
               col_rename = "monthly_return")


##Base Portfolio to compare

baseline_returns_monthly <- "SPY" %>%
  tq_get(get = "stock.prices",
         from = "2015-07-01", warnings = FALSE, stringsAsFactors = FALSE) %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "monthly",
               col_rename = "spy_monthly_return")

baseline_returns_monthly
portfolio_returns_monthly0.5 <- stockindex0.5 %>%
  tq_portfolio(assets_col = symbol,
               returns_col = monthly_return,
               col_rename = "portfolio-monthly")


portfolio_returns_monthly0.6 <- stockindex0.6 %>%
  tq_portfolio(assets_col = symbol,
               returns_col = monthly_return,
               col_rename = "portfolio-monthly")


portfolio_returns_monthly0.7 <- stockindex0.7 %>%
  tq_portfolio(assets_col = symbol,
               returns_col = monthly_return,
               col_rename = "portfolio-monthly")
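
Note that tq_portfolio is called without a weights argument; in that case the underlying PerformanceAnalytics::Return.portfolio assumes an equally weighted portfolio, which matches the equal-weightage theme of the centrality score.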


#Portfolio Compare
stock0.5indexVSSPY <- left_join(portfolio_returns_monthly0.5,
                                baseline_returns_monthly,
                                by = "date")

stock0.6indexVSSPY <- left_join(portfolio_returns_monthly0.6,
                                baseline_returns_monthly,
                                by = "date")

stock0.7indexVSSPY <- left_join(portfolio_returns_monthly0.7,
                                baseline_returns_monthly,
                                by = "date")
#stock0.5indexVSSPY

ggplot(stock0.5indexVSSPY) +
  geom_line(aes(x = date, y = `portfolio-monthly`), color = "blue") +
  geom_line(aes(x = date, y = spy_monthly_return), color = "red")

ggplot(stock0.6indexVSSPY) +
  geom_line(aes(x = date, y = `portfolio-monthly`), color = "blue") +
  geom_line(aes(x = date, y = spy_monthly_return), color = "red")

ggplot(stock0.7indexVSSPY) +
  geom_line(aes(x = date, y = `portfolio-monthly`), color = "blue") +
  geom_line(aes(x = date, y = spy_monthly_return), color = "red")
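
In each of these plots the blue line is the centrality-based portfolio's monthly return and the red line is SPY's.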
## Playground: trial and error to beat the SPY returns

centrality_filter <- Index0.6cor
#filter(Centrality > 0.8 & Centrality < 0.5)
centrality_filter

IndexGet0.6 <- as.character(centrality_filter$symbol)

stockindex0.6 <- tq_get(IndexGet0.6, get = "stock.prices", from = "2015-07-01",
                        warnings = FALSE, stringsAsFactors = FALSE) %>%
  group_by(symbol) %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "monthly",
               col_rename = "monthly_return")

portfolio_returns_monthly0.6 <- stockindex0.6 %>%
  tq_portfolio(assets_col = symbol,
               returns_col = monthly_return,
               col_rename = "portfolio-monthly")

stock0.6indexVSSPY <- left_join(portfolio_returns_monthly0.6,
                                baseline_returns_monthly,
                                by = "date")

ggplot(stock0.6indexVSSPY) +
  geom_line(aes(x = date, y = `portfolio-monthly`), color = "blue") +
  geom_line(aes(x = date, y = spy_monthly_return), color = "red")
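
This block simply re-runs the 0.6 pipeline end to end, so the filter line can be tweaked and the comparison replotted quickly.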

Here is the code I wrote to build a portfolio that beats SPY's returns over the five-year period from July 1, 2015 to July 22, 2020.

centrality_filter <- Index0.5cor %>%
  filter((sector == "Health Care" & Centrality > 0.4) |
         (sector == "Technology" & Centrality > 0.42) |
         (sector == "Consumer Services" & Centrality > 0.49) |
         (sector == "Finance" & Centrality > 0.5) |
         (sector == "Transportation" & Centrality > 0.5) |
         (sector == "Capital Goods" & Centrality > 0.35) |
         (sector == "Miscellaneous" & Centrality > 0.5) |
         (sector == "Basic Industries" & Centrality > 0.39) |
         (sector == "Public Utilities" & Centrality > 0.36) |
         (sector == "Consumer Durables" & Centrality > 0.3) |
         (sector == "Consumer Non-Durables" & Centrality > 0.25))
centrality_filter

IndexGet0.5 <- as.character(centrality_filter$symbol)

stockindex0.5 <- tq_get(IndexGet0.5, get = "stock.prices",
                        from = "2015-07-01", to = "2020-07-22",
                        warnings = FALSE, stringsAsFactors = FALSE) %>%
  group_by(symbol) %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "monthly",
               col_rename = "monthly_return")

portfolio_returns_monthly0.5 <- stockindex0.5 %>%
  tq_portfolio(assets_col = symbol,
               returns_col = monthly_return,
               col_rename = "portfolio-monthly")

stock0.5indexVSSPY <- left_join(portfolio_returns_monthly0.5,
                                baseline_returns_monthly,
                                by = "date")

# Convert both return series to percentages for plotting
stock0.5indexVSSPY$`portfolio-monthly` <- stock0.5indexVSSPY$`portfolio-monthly` * 100
stock0.5indexVSSPY$spy_monthly_return <- stock0.5indexVSSPY$spy_monthly_return * 100

# Manually drop row 14 of the comparison table
stock0.5indexVSSPY <- stock0.5indexVSSPY[-c(14), ]

returns <- ggplot(stock0.5indexVSSPY) +
  geom_line(aes(x = date, y = `portfolio-monthly`), color = "blue") +
  geom_line(aes(x = date, y = spy_monthly_return), color = "red") +
  ggtitle("Portfolio vs S&P 500 Returns over 5 Years") +
  labs(y = "Returns Percentage", x = "Date")

plot(returns)
The blue line is my portfolio; the red line is SPY.

The centrality thresholds shown in the code above were chosen based on my predictions of future trends: which sectors I expect to do well and become increasingly important to society.
