S&P 500 Screener Project - Linear Model Feature Selection¶

Python: Linear Regression Feature Selection¶

  • Dependent variable: recommendation score 1-5 ( 1=strong_buy, 2=buy, 3=neutral, 4=under_perform, 5=sell )
  • Independent variables: p/s, p/b, div yield, 5-yr CompAvg RevGrowth%, 5-yr CompAvg NumSharesGrowth%, med oper marg%, int coverage, d/e, log(market cap)
  • Fixed effects: sector
In [64]:
import pandas as pd
import numpy as np
import yfinance as yf
from tqdm import notebook
from bs4 import BeautifulSoup
from datetime import timedelta
from scipy.stats.mstats import winsorize
import requests
import os
In [14]:
# Set current directory
os.chdir('/home/jovyan/work')

# Pandas settings
pd.set_option("display.max_columns", None)
In [15]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://postgres:headband@192.168.0.151:5432/mydb')
In [16]:
sql_tables = ["Info","Income","BalanceSheet","CashFlow","Recommendation"]
dfs = []
for i in range(len(sql_tables)):
    df = pd.read_sql_table(f"SP500{sql_tables[i]}", con=engine)
    dfs.append(df)
In [17]:
info_df = dfs[0].copy()
income_df = dfs[1].copy()
balance_df = dfs[2].copy()
cash_df = dfs[3].copy()
recomm_df = dfs[4].copy()
In [77]:
lm_df = info_df[['symbol','priceToSalesTrailing12Months','priceToBook','dividendYield','debtToEquity','marketCap']].sort_values(by='symbol')
lm_df.reset_index(drop=True, inplace=True)
In [78]:
# Compound Annual Revenue Growth

idx_income_years = income_df.groupby(['Ticker'])['Date'].agg(['idxmin', 'idxmax'])
first_income_years = income_df.loc[idx_income_years['idxmin']]
last_income_years = income_df.loc[idx_income_years['idxmax']]

merged_income_years = first_income_years.merge(
    last_income_years,
    on='Ticker',
    suffixes=('_first', '_last')
)
merged_income_years['diff'] = ((merged_income_years['Date_last'] - merged_income_years['Date_first']) / np.timedelta64(1, 'M')) / 12
lm_df['CARG'] = np.power(
merged_income_years['Total Revenue_last'] / merged_income_years['Total Revenue_first'],
(1 / merged_income_years['diff'])
) - 1
In [79]:
# Compound Annual Number of Share Growth

idx_balance_years = balance_df.groupby(['Ticker'])['Date'].agg(['idxmin', 'idxmax'])
first_balance_years = balance_df.loc[idx_balance_years['idxmin']]
last_balance_years = balance_df.loc[idx_balance_years['idxmax']]

merged_balance_years = first_balance_years.merge(
    last_balance_years,
    on='Ticker',
    suffixes=('_first', '_last')
)
merged_balance_years['diff'] = ((merged_balance_years['Date_last'] - merged_balance_years['Date_first']) / np.timedelta64(1, 'M')) / 12
lm_df['CANSG'] = np.power(
merged_balance_years['Common Stock_last'] / merged_balance_years['Common Stock_first'],
(1 / merged_balance_years['diff'])
) - 1
In [80]:
# 4 yr Average Operating Margin

merged_finance = pd.merge(cash_df, income_df, how='left', left_on=['Ticker', 'Date'], right_on = ['Ticker','Date'])
merged_finance['Oper Margin'] = merged_finance['Total Cash From Operating Activities']/merged_finance['Total Revenue']
lm_df['opMarg'] = merged_finance.groupby('Ticker')['Oper Margin'].mean().reset_index(drop=True)
In [81]:
# Interest Coverage

merged_finance['Interest Coverage'] = merged_finance['Total Cash From Operating Activities']/np.absolute(merged_finance['Interest Expense'])
idx_years = merged_finance.groupby(['Ticker'])['Date'].agg(['idxmax'])
lm_df['intCov'] = merged_finance.loc[idx_years['idxmax']]['Interest Coverage'].reset_index(drop=True)
In [82]:
# Test
merged_finance[merged_finance['Ticker'] == 'MNST'][['Date','Total Cash From Operating Activities','Interest Expense']]
Out[82]:
Date Total Cash From Operating Activities Interest Expense
1284 2021-12-31 1.155741e+09 -19000.0
1285 2020-12-31 1.364163e+09 -39000.0
1286 2019-12-31 1.113762e+09 -56000.0
1287 2018-12-31 1.161881e+09 -60000.0
In [83]:
# Sector

lm_df['sector'] = info_df.sort_values('symbol')['sector'].reset_index(drop=True)
In [84]:
def set_recommendation(score):
    if score in ['Strong Buy','Buy']:
        return 1
    elif score in ['Market Outperform','Overweight','Perform','Positive','Sector Outperform','Outperform']:
        return 2
    elif score in ['Peer Perform','In-Line','Sector Perform','Sector Weight','Market Perform','Equal-Weight','Hold','Neutral','Mixed']:
        return 3
    elif score in ['Sector Underperform','Underperform','Underweight']:
        return 4
    else:
        return 5
    
recomm_df['Rating'] = recomm_df['To Grade'].apply(set_recommendation)
In [85]:
# Check for missing values and set recommendation score

test = recomm_df.groupby('Ticker')['Rating'].mean()
test_set = set(test.reset_index()['Ticker'].unique())
lm_set = set(lm_df['symbol'].unique())
missingRecs = list(lm_set.difference(test_set))

if len(missingRecs) > 0:
    print('Fixing missing recommendations...')
    df_fix = recomm_df.groupby('Ticker')['Rating'].mean()
    for i in range(len(missingRecs)):
        df_fix.loc[missingRecs[i]] = 3
    df_fix = df_fix.reset_index()
    lm_df['rating'] = df_fix.sort_values('Ticker')['Rating'].reset_index(drop=True)
    print('Fix complete')
else:
    lm_df['rating'] = recomm_df['Rating']
    print('Looks good')
Fixing missing recommendations...
Fix complete
In [86]:
# rename col and zero out nan values
lm_df.rename(columns={'priceToSalesTrailing12Months':'priceToSales'},inplace = True)
lm_df.fillna(0,inplace = True)
# winsorize outliers
win_lim = (0.01,0.02)
lm_df['intCov'] = winsorize(lm_df['intCov'],win_lim)
lm_df['CANSG'] = winsorize(lm_df['CANSG'],win_lim)
lm_df['CARG'] = winsorize(lm_df['CARG'],win_lim)
lm_df['debtToEquity'] = winsorize(lm_df['debtToEquity'],win_lim)
lm_df['priceToBook'] = winsorize(lm_df['priceToBook'],win_lim)
lm_df['opMarg'] = winsorize(lm_df['opMarg'],win_lim)
lm_df['dividendYield'] = winsorize(lm_df['dividendYield'],win_lim)

lm_df.tail()
Out[86]:
symbol priceToSales priceToBook dividendYield debtToEquity marketCap CARG CANSG opMarg intCov sector rating
497 YUM 4.847612 0.000000 0.0186 0.000 3.221238e+10 0.049957 0.0 0.232926 3.096189 Consumer Cyclical 1.857143
498 ZBH 2.855887 1.838131 0.0087 53.098 2.255580e+10 -0.004079 0.0 0.195432 7.193858 Healthcare 2.500000
499 ZBRA 2.560271 6.823679 0.0000 43.221 1.462427e+10 0.100812 0.0 0.186273 213.800000 Technology 2.666667
500 ZION 2.823184 1.163843 0.0279 0.000 8.190056e+09 0.041367 0.0 0.291518 0.000000 Financial Services 2.666667
501 ZTS 9.156736 19.260940 0.0071 144.912 7.225581e+10 0.101057 0.0 0.299283 9.879464 Healthcare 1.250000
In [66]:
# Load
def update_server(table_name,dataframe):  
    try:
        dataframe.to_sql(table_name,con=engine,index=False,if_exists='replace')
        print(f'Successfully imported {table_name} to server')
    except Exception as err:
        print('Got an error while updating server')
        print('Error: ', str(err))
In [87]:
update_server('SP500LM',lm_df)
Successfully imported SP500LM to server