# Building your Long Term Portfolio using Unsupervised ML with Community Detection Analysis in Python & R

To build the portfolio based on the correlation of stock prices in the S&P 500, we will use network centrality as a measure of how well integrated the stocks are with one another.

`stocks_cross_corr, _, _ = calculate_corr(df_stock_prices,1, len(df_stock_prices), 'pearson')stocks_cross_corr = stocks_cross_corr[1]cor_thresold = 0.7G = build_graph(stocks_cross_corr, cor_thresold)partition = community.best_partition(G)modularity = community.modularity(partition, G)values = [partition.get(node) for node in G.nodes()]plt.figure(figsize=(10,10))nx.draw_spring(G, cmap = plt.get_cmap('jet'), node_color = values, node_size=30, with_labels=False)print(modularity)print("Total number of Communities=", len(G.nodes()))dict_betwenness_centrality = nx.betweenness_centrality(G)dict_degree_centrality = nx.degree_centrality(G)dict_closeness_centrality = nx.closeness_centrality(G)dict_eigenvector_centrality = nx.eigenvector_centrality(G)print("dict_degree_centrality: ", dict_degree_centrality)print("dict_closeness_centrality: ", dict_closeness_centrality)print("dict_eigenvector_centrality: ", dict_eigenvector_centrality)print("dict_betweenness_centrality: ", dict_betwenness_centrality)`

## Once the four types of centrality are obtained, we build our own portfolio-selection algorithm that combines them with equal weighting. To do that, we merge the dictionaries holding the four centralities and loop over each stock, adding its four centrality values together to create a final centrality metric. Since we are using equal weighting, the formula is a simple sum with no weights assigned.

`#Portfolio Formula: c_dict = dict([(k, [dict_betwenness_centrality[k], dict_eigenvector_centrality[k], dict_degree_centrality[k], dict_closeness_centrality[k] ]) for k in dict_betwenness_centrality])#print(c_dict)        C_total = {}for key in c_dict:     C_total[key] = sum(c_dict[key])         print("The Centrality total for stocks are:", C_total)   newDict = dict(filter(lambda elem: elem[1] > 0, C_total.items()))print("Stocks greater than 0.3 centrality are",newDict)print(len(newDict))df_centrality = pd.DataFrame(list(newDict.items()),columns = ['Symbol','Centrality']) df_centrality.sort_values(by='Centrality', ascending=False)#df_centrality.head(20)#type(df_centrality['Centrality'])df_centrality.to_csv('centrality_of_stocks_0.7cor.csv',index=False)`
```r
# This R environment comes with many helpful analytics packages installed.
# It is defined by the kaggle/rstats Docker image:
# https://github.com/kaggle/docker-rstats

# Packages used by the notebook, attached in the original order.
library(tidyverse) # metapackage of all tidyverse packages
library(tidyquant)
library(jsonlite)
library(tidyverse) # repeated in the original notebook cell
library(readr)
library(igraph)
library(dplyr)
library(lubridate)
library(data.table)
library(Quandl)

# Input data files are available in the read-only "../input/" directory.
# Running this lists all files under the input directory.
list.files(path = "../input")

# You can write up to 5GB to the current directory (/kaggle/working/), which
# is preserved as output when you create a version using "Save & Run All".
# You can also write temporary files to /kaggle/temp/, but they won't be saved
# outside of the current session.
```
```r
# Data load ----

# Stock price table.
stockprices <- read.csv(
  file = "../input/usstockprices/stocks_price_final.csv"
)

# Centrality scores exported for three correlation thresholds.
centrality0.5 <- read.csv(
  file = "../input/centrality-of-stocks-network-analysis/centrality_of_stocks_0.5cor.csv"
)
centrality0.6 <- read.csv(
  file = "../input/centrality-of-stocks-network-analysis/centrality_of_stocks_0.6cor.csv"
)
centrality0.7 <- read.csv(
  file = "../input/centrality-of-stocks-network-analysis/centrality_of_stocks_0.7cor.csv"
)
```
```r
# Merge the centrality values with the other columns of the price table
# (e.g. market cap and sector) so they can be used for further filtering.

# One row per symbol: drop repeated symbols from the price table.
is_first_occurrence <- !duplicated(stockprices$symbol)
stocks <- stockprices[is_first_occurrence, ]

Index0.5cor <- merge(stocks, centrality0.5, by.x = "symbol", by.y = "Symbol")
Index0.6cor <- merge(stocks, centrality0.6, by.x = "symbol", by.y = "Symbol")
Index0.7cor <- merge(stocks, centrality0.7, by.x = "symbol", by.y = "Symbol")
# head(Index0.6cor)

IndexGet0.5 <- as.character(Index0.5cor[["symbol"]])
IndexGet0.6 <- as.character(Index0.6cor[["symbol"]])
IndexGet0.7 <- as.character(Index0.7cor[["symbol"]])

# Portfolio testing ----

# Download prices for `symbols` from 2015-07-01 onwards and turn the adjusted
# prices into per-symbol monthly returns (result stays grouped by symbol).
fetch_monthly_returns <- function(symbols) {
  prices <- tq_get(
    symbols,
    get = "stock.prices",
    from = "2015-07-01",
    warnings = FALSE,
    stringsAsFactors = FALSE
  )
  prices_by_symbol <- group_by(prices, symbol)
  tq_transmute(
    prices_by_symbol,
    select = adjusted,
    mutate_fun = periodReturn,
    period = "monthly",
    col_rename = "monthly_return"
  )
}

# 0.5 correlation centrality prices
stockindex0.5 <- fetch_monthly_returns(IndexGet0.5)
stockindex0.5

# 0.6 correlation centrality prices
stockindex0.6 <- fetch_monthly_returns(IndexGet0.6)

# 0.7 correlation centrality prices
stockindex0.7 <- fetch_monthly_returns(IndexGet0.7)

# Base portfolio to compare against: SPY monthly returns.
spy_prices <- tq_get(
  "SPY",
  get = "stock.prices",
  from = "2015-07-01",
  warnings = FALSE,
  stringsAsFactors = FALSE
)
baseline_returns_monthly <- tq_transmute(
  spy_prices,
  select = adjusted,
  mutate_fun = periodReturn,
  period = "monthly",
  col_rename = "spy_monthly_return"
)
baseline_returns_monthly
```
```r
# Aggregate per-symbol monthly returns into one portfolio return series.
build_portfolio_returns <- function(returns_by_symbol) {
  tq_portfolio(
    returns_by_symbol,
    assets_col = symbol,
    returns_col = monthly_return,
    col_rename = "portfolio-monthly"
  )
}

# Line chart of the portfolio (blue) against the SPY baseline (red).
plot_portfolio_vs_spy <- function(comparison) {
  ggplot(comparison) +
    geom_line(aes(x = date, y = `portfolio-monthly`), color = "blue") +
    geom_line(aes(x = date, y = spy_monthly_return), color = "red")
}

portfolio_returns_monthly0.5 <- build_portfolio_returns(stockindex0.5)
portfolio_returns_monthly0.6 <- build_portfolio_returns(stockindex0.6)
portfolio_returns_monthly0.7 <- build_portfolio_returns(stockindex0.7)

# Portfolio compare: attach the SPY return of the same date to each series.
stock0.5indexVSSPY <- left_join(
  portfolio_returns_monthly0.5,
  baseline_returns_monthly,
  by = "date"
)
stock0.6indexVSSPY <- left_join(
  portfolio_returns_monthly0.6,
  baseline_returns_monthly,
  by = "date"
)
stock0.7indexVSSPY <- left_join(
  portfolio_returns_monthly0.7,
  baseline_returns_monthly,
  by = "date"
)
# stock0.5indexVSSPY

plot_portfolio_vs_spy(stock0.5indexVSSPY)
plot_portfolio_vs_spy(stock0.6indexVSSPY)
plot_portfolio_vs_spy(stock0.7indexVSSPY)
```
```r
# PLAYGROUND - trial and error here to beat the SPY graph in returns.

# Currently the unfiltered 0.6 table; a filter can be piped on here, e.g.
# filter(Centrality > 0.8 & Centrality < 0.5)
centrality_filter <- Index0.6cor
centrality_filter

IndexGet0.6 <- as.character(centrality_filter[["symbol"]])

# Monthly returns per symbol from 2015-07-01 onwards.
playground_prices <- tq_get(
  IndexGet0.6,
  get = "stock.prices",
  from = "2015-07-01",
  warnings = FALSE,
  stringsAsFactors = FALSE
)
stockindex0.6 <- tq_transmute(
  group_by(playground_prices, symbol),
  select = adjusted,
  mutate_fun = periodReturn,
  period = "monthly",
  col_rename = "monthly_return"
)

# Combine the per-symbol returns into one portfolio series.
portfolio_returns_monthly0.6 <- tq_portfolio(
  stockindex0.6,
  assets_col = symbol,
  returns_col = monthly_return,
  col_rename = "portfolio-monthly"
)

# Attach the SPY baseline by date and plot both series.
stock0.6indexVSSPY <- left_join(
  portfolio_returns_monthly0.6,
  baseline_returns_monthly,
  by = "date"
)

playground_chart <- ggplot(stock0.6indexVSSPY)
playground_chart <- playground_chart +
  geom_line(aes(x = date, y = `portfolio-monthly`), color = "blue")
playground_chart +
  geom_line(aes(x = date, y = spy_monthly_return), color = "red")
```

This is the code I wrote to build a portfolio that could beat the SPY returns over the five-year period from July 1st, 2015 to July 22nd, 2020:

```r
# Final portfolio: keep stocks from the 0.5-correlation table whose combined
# centrality exceeds a sector-specific cut-off, then compare the portfolio's
# monthly returns with SPY over a fixed window.

# Evaluation window. tq_get() takes the end date as `to`; the previous `till`
# argument is not a tq_get() parameter, so the end date was not applied.
period_start <- "2015-07-01"
period_end <- "2020-07-22"

centrality_filter <- Index0.5cor %>%
  filter(
    (sector == "Health Care" & Centrality > 0.4) |
      (sector == "Technology" & Centrality > 0.42) |
      (sector == "Consumer Services" & Centrality > 0.49) |
      (sector == "Finance" & Centrality > 0.5) |
      (sector == "Transportation" & Centrality > 0.5) |
      (sector == "Capital Goods" & Centrality > 0.35) |
      (sector == "Miscellaneous" & Centrality > 0.5) |
      (sector == "Basic Industries" & Centrality > 0.39) |
      (sector == "Public Utilities" & Centrality > 0.36) |
      (sector == "Consumer Durables" & Centrality > 0.3) |
      (sector == "Consumer Non-Durables" & Centrality > 0.25)
  )
centrality_filter

IndexGet0.5 <- as.character(centrality_filter$symbol)

# Per-symbol monthly returns inside the evaluation window.
stockindex0.5 <- tq_get(
  IndexGet0.5,
  get = "stock.prices",
  from = period_start,
  to = period_end,
  warnings = FALSE,
  stringsAsFactors = FALSE
) %>%
  group_by(symbol) %>%
  tq_transmute(
    select = adjusted,
    mutate_fun = periodReturn,
    period = "monthly",
    col_rename = "monthly_return"
  )

portfolio_returns_monthly0.5 <- stockindex0.5 %>%
  tq_portfolio(
    assets_col = symbol,
    returns_col = monthly_return,
    col_rename = "portfolio-monthly"
  )

# SPY over the same window, so the final (partial) month carries the same
# date in both series and survives the join on `date`.
baseline_returns_window <- "SPY" %>%
  tq_get(
    get = "stock.prices",
    from = period_start,
    to = period_end,
    warnings = FALSE,
    stringsAsFactors = FALSE
  ) %>%
  tq_transmute(
    select = adjusted,
    mutate_fun = periodReturn,
    period = "monthly",
    col_rename = "spy_monthly_return"
  )

stock0.5indexVSSPY <- left_join(
  portfolio_returns_monthly0.5,
  baseline_returns_window,
  by = "date"
)

# Express both return series in percent.
stock0.5indexVSSPY$`portfolio-monthly` <- stock0.5indexVSSPY$`portfolio-monthly` * 100
stock0.5indexVSSPY$spy_monthly_return <- stock0.5indexVSSPY$spy_monthly_return * 100

# Row 14 is dropped before plotting.
# NOTE(review): the reason is not visible here (possibly an outlier month) --
# confirm and document.
stock0.5indexVSSPY <- stock0.5indexVSSPY[-c(14), ]

returns <- ggplot(stock0.5indexVSSPY) +
  geom_line(aes(x = date, y = `portfolio-monthly`), color = "blue") +
  geom_line(aes(x = date, y = spy_monthly_return), color = "red") +
  ggtitle("Portfolio vs S&P 500 Returns over 5 years") +
  labs(y = "Returns Percentage", x = "Date")
plot(returns)
```

The centrality thresholds shown in the code above were chosen based on my predictions of future trends: which sectors will do well and become important to society as a service.

--

--

--

## More from Aakash Kedia

CS, music and football in no particular order

Love podcasts or audiobooks? Learn on the go with our new app.

## Aakash Kedia

CS, music and football in no particular order