install.packages(c("dplyr","ggplot2"))
library(dplyr)
library(ggplot2)
Installing packages into ‘/work/.R/library’
(as ‘lib’ is unspecified)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Toy data set: three (X, Y) observations.
poinst <- data.frame(X = c(0, 1, 1), Y = c(2, 2, 8))
# Baseline model: the mean of Y.
modelobase <- mean(poinst[["Y"]])
cat("Modelo base modelo: ", modelobase)
Modelo base modelo: 4
# Simple linear regression of Y on X over the toy data set.
modelo_simple <- lm(formula = Y ~ X, data = poinst)
modelo_simple
# SSE: sum of the squared residuals of the fitted model.
SSE <- sum(residuals(modelo_simple)^2)
cat("SSE es: ", SSE)
SSE es: 18
# SST: total sum of squares of Y around its own mean.
SST <- with(poinst, sum((Y - mean(Y))^2))
cat("SST es: ", SST)
SST es: 24
# SSR: the part of SST that is not left over in SSE.
SSR <- SST - SSE
# R-squared: SSR expressed as a fraction of SST.
R2 <- SSR / SST
cat("R cuadrado es: ", R2)
# Top-level expression: auto-prints the summary of the fitted model.
summary(modelo_simple)
R cuadrado es: 0.25
# Inputs for the adjusted R-squared formula: R2, n and k.
R2 <- 0.81
n <- 180
k <- 18
# Adjusted R-squared: 1 - ((n - 1) / (n - k - 1)) * (1 - R2).
# local() keeps the intermediate factor out of the global environment.
R2a <- local({
  penalty <- (n - 1) / (n - k - 1)
  1 - penalty * (1 - R2)
})
cat("R cuadrado ajustado es: ", R2a)
R cuadrado ajustado es: 0.7887578
# Load the baseball data set and print a compact overview of its columns.
moneyball <- read.csv(file = "/work/baseball.csv")
glimpse(moneyball)
# Procedure for analysing factor-type variables (kept disabled)
# countColumn <- function(df,numColumn){
# headtitle <- unique(names(df))
# conteo <- (data.frame(col=moneyball[,c(headtitle[numColumn])]) %>% count(col))
# names(conteo)[1] <- headtitle[numColumn]
# return (conteo)
# }
# for(i in 1:length(unique(names(moneyball)))){
# print(countColumn(moneyball,i))
# }
# Bar charts counting the rows per Team and per League.
ggplot(data = moneyball, mapping = aes(x = Team)) + geom_bar()
ggplot(data = moneyball, mapping = aes(x = League)) + geom_bar()
Rows: 902
Columns: 15
$ Team <chr> "ANA", "ARI", "ATL", "BAL", "BOS", "CHC", "CHW", "CIN", "…
$ League <chr> "AL", "NL", "NL", "AL", "AL", "NL", "AL", "NL", "AL", "NL…
$ Year <int> 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 200…
$ RS <int> 691, 818, 729, 687, 772, 777, 798, 735, 897, 923, 724, 74…
$ RA <int> 730, 677, 643, 829, 745, 701, 795, 850, 821, 906, 876, 74…
$ W <int> 75, 92, 88, 63, 82, 88, 83, 66, 91, 73, 66, 76, 93, 65, 8…
$ OBP <dbl> 0.327, 0.341, 0.324, 0.319, 0.334, 0.336, 0.334, 0.324, 0…
$ SLG <dbl> 0.405, 0.442, 0.412, 0.380, 0.439, 0.430, 0.451, 0.419, 0…
$ BA <dbl> 0.261, 0.267, 0.260, 0.248, 0.266, 0.261, 0.268, 0.262, 0…
$ Playoffs <int> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
$ RankSeason <int> NA, 5, 7, NA, NA, NA, NA, NA, 6, NA, NA, NA, 4, NA, NA, N…
$ RankPlayoffs <int> NA, 1, 3, NA, NA, NA, NA, NA, 4, NA, NA, NA, 4, NA, NA, N…
$ G <int> 162, 162, 162, 162, 161, 162, 162, 162, 162, 162, 162, 16…
$ OOBP <dbl> 0.331, 0.311, 0.314, 0.337, 0.329, 0.321, 0.334, 0.341, 0…
$ OSLG <dbl> 0.412, 0.404, 0.384, 0.439, 0.393, 0.398, 0.427, 0.455, 0…
# First quartile (25%) of the RS column; the result keeps its "25%" name.
primerCuartil <- quantile(moneyball$RS, probs = 0.25)
cat("Primer cuartil(25%) es: ", primerCuartil)
Primer cuartil(25%) es: 641.25
# RD: per-row difference between the RS and RA columns.
moneyball$RD <- with(moneyball, RS - RA)
# which.min() gives the row index of the smallest RD, not the RD value itself.
diferenciaMinima <- which.min(moneyball$RD)
# Year recorded on that row.
anioMenorDiferencia <- moneyball$Year[diferenciaMinima]
cat("Diferencia mínima es: ", diferenciaMinima, "\n")
cat("Año con menor diferencia es: ", anioMenorDiferencia)
# Scatter plot to see how many games a team must win to reach the Playoffs
# m <- ggplot(moneyball, aes(x = W, y = Team,color = factor(Playoffs)))+
# geom_point() +
# scale_color_manual(values = c("#000000", "#FF2D00"), name = "Made Playoffs")
# m + xlab("Wins")
Diferencia mínima es: 896
Año con menor diferencia es: 1962
# Scatter plot showing the RD variable vs W (plot code kept disabled)
# ggplot(moneyball, aes(x = RD,y = W,color = factor(Playoffs))) +
# geom_point() +
# scale_color_manual(values = c("#000000", "#FF2D00"), name = "Made Playoffs")
# Linear model of W as a function of RD.
modelW <- lm(W ~ RD, data = moneyball)
# Single-bracket indexing selects the "r.squared" component as a one-element
# list, so the value is extracted by name when printing.
r2ModelW <- summary(modelW)["r.squared"]
cat("El valor R cuadrado del modelo es: ", r2ModelW[["r.squared"]])
El valor R cuadrado del modelo es: 0.8808104
# Target number of wins.
minWin <- 95
# Invert the fitted line W = intercept + slope * RD to get the RD that
# corresponds to minWin (coefficient 1 is the intercept, 2 the RD slope).
valMinRD <- (minWin - coef(modelW)[1]) / coef(modelW)[2]
# Shown rounded up to the next whole number.
cat("Valor mínimo de RD es: ", ceiling(valMinRD))
Valor mínimo de RD es: 134
# Linear model of RS on OBP and SLG.
modelRS <- lm(RS ~ OBP + SLG, data = moneyball)
# One-element list holding the model's R-squared.
r2ModelRs <- summary(modelRS)["r.squared"]
cat("R cuadrado del modelo es: ", r2ModelRs[["r.squared"]])
# Regression model with 3 variables (kept disabled)
# modelRS = lm(RS~OBP+SLG+BA, data=moneyball)
# summary(modelRS)
R cuadrado del modelo es: 0.9295811
# OBP and SLG values to predict RS for.
valObp <- 0.339
valSlg <- 0.430
# One-row data frame whose column names match the predictors of modelRS.
dataTestRS <- data.frame(OBP = valObp, SLG = valSlg)
str(dataTestRS)
predictRS <- predict(modelRS, newdata = dataTestRS)
cat("El valor predecido para RS es: ", predictRS)
'data.frame': 1 obs. of 2 variables:
$ OBP: num 0.339
$ SLG: num 0.43
El valor predecido para RS es: 804.987
# Linear model of RA on OOBP and OSLG; its summary is auto-printed.
modelRA <- lm(RA ~ OOBP + OSLG, data = moneyball)
summary(modelRA)
# OOBP and OSLG values to predict RA for.
valOobp <- 0.307
valOsgl <- 0.373
# One-row data frame whose column names match the predictors of modelRA.
dataTestRA <- data.frame(OOBP = valOobp, OSLG = valOsgl)
str(dataTestRA)
predictRA <- predict(modelRA, newdata = dataTestRA)
cat("El valor predecido para RA es: ", predictRA)
'data.frame': 1 obs. of 2 variables:
$ OOBP: num 0.307
$ OSLG: num 0.373
El valor predecido para RA es: 621.9258
# RS and RA figures entered by hand as whole numbers.
ajustadoRS <- 805
ajustadoRA <- 622
# Their difference is the RD value fed to the wins model.
ajustadoRD <- ajustadoRS - ajustadoRA
dataTestW <- data.frame(RD = ajustadoRD)
valueWPredict <- predict(modelW, newdata = dataTestW)
cat("El valor predecido para victorias es: ", valueWPredict)
El valor predecido para victorias es: 100.2365