# load packages
if (!require(pacman)) install.packages("pacman")
pacman::p_load(pacman, RPostgreSQL, ggplot2, tidyverse, dplyr)
tryCatch({
drv <- dbDriver("PostgreSQL")
print("Connecting to Database…")
connec <- dbConnect(drv,
dbname = "mydb",
host = "192.168.0.151",
port = "5432",
user = "postgres",
password = "headband")
print("Database Connected!")
},
error=function(cond) {
print("Unable to connect to Database.")
})
[1] "Connecting to Database…" [1] "Database Connected!"
df <- dbGetQuery(connec, "SELECT * FROM \"SP500LM\"")
tail(df)
symbol | priceToSales | priceToBook | dividendYield | debtToEquity | marketCap | CARG | CANSG | opMarg | intCov | sector | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <dbl> | |
497 | XYL | 3.135184 | 5.505256 | 0.0130 | 78.166 | 16337443840 | -0.0007685965 | 0 | 0.1362332 | 7.078947 | Industrials | 2.500000 |
498 | YUM | 4.847612 | 0.000000 | 0.0186 | 0.000 | 32212383744 | 0.0499569186 | 0 | 0.2329258 | 3.096189 | Consumer Cyclical | 1.857143 |
499 | ZBH | 2.855887 | 1.838131 | 0.0087 | 53.098 | 22555797504 | -0.0040788575 | 0 | 0.1954320 | 7.193858 | Healthcare | 2.500000 |
500 | ZBRA | 2.560271 | 6.823679 | 0.0000 | 43.221 | 14624266240 | 0.1008117913 | 0 | 0.1862731 | 213.800000 | Technology | 2.666667 |
501 | ZION | 2.823184 | 1.163843 | 0.0279 | 0.000 | 8190056448 | 0.0413673890 | 0 | 0.2915183 | 0.000000 | Financial Services | 2.666667 |
502 | ZTS | 9.156736 | 19.260940 | 0.0071 | 144.912 | 72255807488 | 0.1010567650 | 0 | 0.2992832 | 9.879464 | Healthcare | 1.250000 |
ggplot(df, aes(x=sector, y=priceToSales)) + geom_boxplot() + theme(axis.text.x = element_text(angle=45))
ggplot(df, aes(x=sector, y=dividendYield)) + geom_boxplot() + theme(axis.text.x = element_text(angle=45))
ggplot(df, aes(x=sector, y=opMarg)) + geom_boxplot() + theme(axis.text.x = element_text(angle=45))
test_cyc = subset(df, sector == "Consumer Defensive")
tail(test_cyc[order(test_cyc$intCov),])
ggplot(df, aes(x=sector, y=intCov)) + geom_boxplot() + theme(axis.text.x = element_text(angle=45))
symbol | priceToSales | priceToBook | dividendYield | debtToEquity | marketCap | CARG | CANSG | opMarg | intCov | sector | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <dbl> | |
157 | EL | 4.3433313 | 14.333968 | 0.0088 | 110.523 | 78666416128 | 0.05821139 | 0.000000000 | 0.18500557 | 20.98844 | Consumer Defensive | 1.727273 |
225 | HRL | 2.0136251 | 3.810466 | 0.0211 | 45.212 | 25088436224 | 0.06034583 | 0.005112769 | 0.10816551 | 23.13561 | Consumer Defensive | 3.000000 |
12 | ADM | 0.4843027 | 2.191079 | 0.0193 | 50.788 | 45700739072 | 0.09830687 | 0.053573828 | -0.02959775 | 24.88679 | Consumer Defensive | 2.600000 |
366 | PG | 3.8900514 | 6.901905 | 0.0263 | 67.215 | 311931535360 | 0.05811584 | 0.442118260 | 0.23009448 | 38.09339 | Consumer Defensive | 1.833333 |
110 | COST | 0.9699068 | 13.456687 | 0.0067 | 44.050 | 210991595520 | 0.11479989 | 0.000000000 | 0.04531590 | 52.38596 | Consumer Defensive | 1.800000 |
307 | MNST | 8.0988750 | 9.107746 | 0.0000 | 0.500 | 47103950848 | 0.13324502 | 0.004730752 | 0.26888066 | 392.81342 | Consumer Defensive | 2.500000 |
test_cansg = subset(df, sector == "Technology")
tail(test_cansg[order(test_cansg$CANSG),])
ggplot(df, aes(x=sector, y=CANSG)) + geom_boxplot() + theme(axis.text.x = element_text(angle=45))
symbol | priceToSales | priceToBook | dividendYield | debtToEquity | marketCap | CARG | CANSG | opMarg | intCov | sector | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <dbl> | |
36 | ANET | 11.229631 | 2.647189 | 0.000 | 1.384 | 35458023424 | 0.11069462 | 0.4421183 | 0.3237822 | 0.000000 | Technology | 2.363636 |
185 | FFIV | 3.426164 | 4.481531 | 0.000 | 26.674 | 9174612992 | 0.06396900 | 0.4421183 | 0.3036429 | 0.000000 | Technology | 2.625000 |
263 | KLAC | 5.246267 | 15.692973 | 0.011 | 476.075 | 48327995392 | 0.26324296 | 0.4421183 | 0.2914832 | 0.000000 | Technology | 2.222222 |
281 | LRCX | 3.228145 | 10.485836 | 0.012 | 79.731 | 55611383808 | 0.21374443 | 0.4421183 | 0.2414883 | 0.000000 | Technology | 2.150000 |
293 | MCHP | 5.192484 | 3.244548 | 0.016 | 133.170 | 35417415680 | 0.08434484 | 0.4421183 | 0.3437371 | 11.061089 | Technology | 1.857143 |
471 | VRSN | 14.536477 | 0.000000 | 0.000 | 0.000 | 19964049408 | 0.02997862 | 0.4421183 | 0.5928969 | 9.694937 | Technology | 3.000000 |
test_carg = subset(df, sector == "Healthcare")
tail(test_carg[order(test_carg$CARG),])
ggplot(df, aes(x=sector, y=CARG)) + geom_boxplot() + theme(axis.text.x = element_text(angle=45))
symbol | priceToSales | priceToBook | dividendYield | debtToEquity | marketCap | CARG | CANSG | opMarg | intCov | sector | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <dbl> | |
105 | CNC | 0.3568911 | 1.762078 | 0.0000 | 83.680 | 46481850368 | 0.2773785 | 0.000000000 | 0.03277978 | 6.323308 | Healthcare | 1.818182 |
150 | DXCM | 3.0449908 | 4.267568 | 0.0000 | 94.751 | 8140478464 | 0.3338239 | 0.000000000 | 0.19001796 | 4.411765 | Healthcare | 2.250000 |
472 | VRTX | 9.5678800 | 8.620013 | 0.0000 | 7.999 | 76041682944 | 0.3544536 | -0.006057656 | 0.41676951 | 42.983740 | Healthcare | 2.300000 |
95 | CI | 0.5371745 | 2.022405 | 0.0163 | 73.102 | 95073427456 | 0.4070775 | 0.000000000 | 0.06126098 | 5.531538 | Healthcare | 2.117647 |
314 | MRNA | 2.1643136 | 18.782501 | 0.0000 | 5.546 | 48913485824 | 0.4070775 | 0.000000000 | -0.06295800 | 392.813417 | Healthcare | 3.200000 |
392 | REGN | 4.7639020 | 6.895521 | 0.0000 | 13.560 | 78642970624 | 0.4070775 | 0.000000000 | 0.38647089 | 123.582897 | Healthcare | 2.692308 |
test_marketCap = subset(df, sector == "Technology")
tail(test_marketCap[order(test_marketCap$marketCap),])
ggplot(df, aes(x=sector, y=marketCap)) + geom_boxplot() + theme(axis.text.x = element_text(angle=45))
symbol | priceToSales | priceToBook | dividendYield | debtToEquity | marketCap | CARG | CANSG | opMarg | intCov | sector | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <dbl> | |
353 | ORCL | 3.957854 | 20.461940 | 0.0164 | 0.000 | 25.84706 | 0.024160905 | -0.001252388 | 0.3304686 | 3.462432 | Technology | 2.700000 |
116 | CSCO | 3.368790 | 4.445404 | 0.0335 | 25.985 | 25.88097 | 0.003276903 | -0.003692556 | 0.3012853 | 35.608295 | Technology | 2.684211 |
48 | AVGO | 6.306811 | 7.797709 | 0.0306 | 188.023 | 25.96589 | 0.096382079 | 0.000000000 | 0.4653466 | 7.301857 | Technology | 1.200000 |
341 | NVDA | 10.590220 | 4.607480 | 0.0009 | 45.008 | 26.46898 | 0.318381540 | 0.440677729 | 0.3607758 | 38.593220 | Technology | 2.000000 |
318 | MSFT | 9.106365 | 13.470004 | 0.0088 | 47.075 | 28.22187 | 0.163571170 | 0.034525303 | 0.4361364 | 43.495359 | Technology | 1.592593 |
4 | AAPL | 6.015734 | 34.274002 | 0.0057 | 205.984 | 28.47747 | 0.113028291 | 0.126278356 | 0.2841341 | 39.333837 | Technology | 2.050000 |
Warning message: “Removed 10 rows containing non-finite values (stat_boxplot).”
test_debtToEquity = subset(df, sector == "Healthcare")
tail(test_debtToEquity[order(test_debtToEquity$debtToEquity),])
ggplot(df, aes(x=sector, y=debtToEquity)) + geom_boxplot() + theme(axis.text.x = element_text(angle=45))
symbol | priceToSales | priceToBook | dividendYield | debtToEquity | marketCap | CARG | CANSG | opMarg | intCov | sector | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <dbl> | |
477 | WAT | 5.9457110 | 45.627945 | 0.0000 | 408.595 | 17051710464 | 0.048048096 | 0.003311143 | 0.27985865 | 16.629000 | Healthcare | 2.750000 |
5 | ABBV | 4.2414446 | 18.573164 | 0.0393 | 451.440 | 243242598400 | 0.197111448 | 0.000000000 | 0.39994163 | 9.400330 | Healthcare | 2.583333 |
147 | DVA | 0.7569899 | 7.403483 | 0.0000 | 493.730 | 8793460736 | 0.005998282 | -0.163934017 | 0.17271161 | 6.902272 | Healthcare | 3.000000 |
6 | ABC | 0.1256231 | 0.000000 | 0.0126 | 857.149 | 28851394560 | 0.084103101 | 0.008274333 | 0.01138503 | 14.607908 | Healthcare | 2.272727 |
32 | AMGN | 4.9194274 | 14.165639 | 0.0314 | 994.450 | 129459642368 | 0.030389266 | 0.008984578 | 0.40917516 | 7.736842 | Healthcare | 2.777778 |
322 | MTD | 6.7218914 | 45.627945 | 0.0000 | 994.450 | 25982576640 | 0.081916215 | 0.000000000 | 0.21809499 | 21.017182 | Healthcare | 3.250000 |
test_priceToBook = subset(df, sector == "Healthcare")
tail(test_priceToBook[order(test_priceToBook$priceToBook),])
ggplot(df, aes(x=sector, y=priceToBook)) + geom_boxplot() + theme(axis.text.x = element_text(angle=45))
symbol | priceToSales | priceToBook | dividendYield | debtToEquity | marketCap | CARG | CANSG | opMarg | intCov | sector | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <dbl> | |
502 | ZTS | 9.1567360 | 19.26094 | 0.0071 | 144.912 | 72255807488 | 0.10105677 | 0.000000000 | 0.2992832 | 9.879464 | Healthcare | 1.250000 |
276 | LLY | 10.3813930 | 41.93240 | 0.0119 | 174.931 | 304411508736 | 0.09625590 | -0.033745586 | 0.2487467 | 21.367569 | Healthcare | 2.100000 |
233 | IDXX | 8.6950810 | 45.39002 | 0.0000 | 200.624 | 28469450752 | 0.13253976 | 0.005644331 | 0.2114871 | 25.347088 | Healthcare | 2.125000 |
215 | HCA | 0.9975712 | 45.62794 | 0.0105 | 0.000 | 59959013376 | 0.07968804 | 0.000000000 | 0.1561414 | 5.720945 | Healthcare | 1.791667 |
322 | MTD | 6.7218914 | 45.62794 | 0.0000 | 994.450 | 25982576640 | 0.08191622 | 0.000000000 | 0.2180950 | 21.017182 | Healthcare | 3.250000 |
477 | WAT | 5.9457110 | 45.62794 | 0.0000 | 408.595 | 17051710464 | 0.04804810 | 0.003311143 | 0.2798586 | 16.629000 | Healthcare | 2.750000 |
ggplot(df, aes(x=sector, y=rating)) + geom_boxplot() + theme(axis.text.x = element_text(angle=45))
ggplot(df, aes(x=rating)) +
geom_histogram(aes(y=..density..), # Histogram with density instead of count on y-axis
binwidth=.5,
colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666")
colnames(df)
set.seed(1)
#create ID column
df$id <- 1:nrow(df)
#use 70% of dataset as training set and 30% as test set
train <- df %>% dplyr::sample_frac(0.70)
test <- dplyr::anti_join(df, train, by = 'id')
model <- lm(rating ~ priceToSales+priceToBook+dividendYield+debtToEquity+marketCap+CARG+CANSG+opMarg+intCov+ sector-1, train)
summary(model)
Call: lm(formula = rating ~ priceToSales + priceToBook + dividendYield + debtToEquity + marketCap + CARG + CANSG + opMarg + intCov + sector - 1, data = train) Residuals: Min 1Q Median 3Q Max -1.52975 -0.33580 -0.05325 0.33495 1.23409 Coefficients: Estimate Std. Error t value Pr(>|t|) priceToSales 1.006e-02 1.165e-02 0.863 0.389 priceToBook -2.727e-03 3.646e-03 -0.748 0.455 dividendYield 1.014e+01 2.046e+00 4.957 1.14e-06 *** debtToEquity 2.532e-04 1.593e-04 1.589 0.113 marketCap -2.164e-13 1.886e-13 -1.147 0.252 CARG -5.045e-01 2.704e-01 -1.865 0.063 . CANSG -2.104e-01 2.159e-01 -0.974 0.331 opMarg -3.070e-02 2.533e-01 -0.121 0.904 intCov 6.526e-04 4.298e-04 1.518 0.130 sectorBasic Materials 2.138e+00 1.331e-01 16.070 < 2e-16 *** sectorCommunication Services 1.994e+00 1.462e-01 13.637 < 2e-16 *** sectorConsumer Cyclical 2.169e+00 9.321e-02 23.272 < 2e-16 *** sectorConsumer Defensive 2.219e+00 1.204e-01 18.433 < 2e-16 *** sectorEnergy 1.816e+00 1.524e-01 11.913 < 2e-16 *** sectorFinancial Services 2.209e+00 1.068e-01 20.690 < 2e-16 *** sectorHealthcare 2.241e+00 9.715e-02 23.067 < 2e-16 *** sectorIndustrials 2.180e+00 7.953e-02 27.413 < 2e-16 *** sectorReal Estate 1.684e+00 1.644e-01 10.244 < 2e-16 *** sectorTechnology 2.200e+00 1.008e-01 21.836 < 2e-16 *** sectorUtilities 2.117e+00 1.321e-01 16.028 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.4644 on 331 degrees of freedom Multiple R-squared: 0.9639, Adjusted R-squared: 0.9617 F-statistic: 441.4 on 20 and 331 DF, p-value: < 2.2e-16
modelResiduals <- as.data.frame(residuals(model))
ggplot(modelResiduals, aes(residuals(model))) +
geom_histogram(fill='deepskyblue', color='black')
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# test model on new data
preds <- predict(sp500model, test)
modelEval <- cbind(test$rating, preds)
colnames(modelEval) <- c('Actual', 'Predicted')
modelEval <- as.data.frame(modelEval)
head(modelEval)
Actual | Predicted | |
---|---|---|
<dbl> | <dbl> | |
1 | 2.583333 | 2.587791 |
2 | 2.272727 | 2.545884 |
3 | 2.666667 | 2.265010 |
4 | 1.733333 | 2.357230 |
5 | 2.266667 | 2.357168 |
6 | 1.961538 | 2.158867 |
mse <- mean((modelEval$Actual - modelEval$Predicted)**2)
rmse <- sqrt(mse)
df$lmPred = predict(sp500model, df)
dbWriteTable(connec,"SP500LM",df,overwrite = TRUE, row.names = FALSE)