library(ggplot2)
library(dplyr)
library(openintro)
library(DescTools)
Clase 08: apoyo
Conjunto de Datos Loan 50
Variable a estudiar “interest_rate”
Obtener Estadísticos Loan 50. Interest Rate
# Descriptive statistics for the raw interest rates in loan50.
# Lines starting with "#>" reproduce the console output of the call above them.

# Values in dataset order.
loan50[["interest_rate"]]
#> [1] 10.90 9.92 26.30 9.92 9.43 9.92 17.09 6.08 7.97 12.62 17.09 5.31
#> [13] 7.35 5.31 7.96 24.85 18.06 10.42 7.96 19.42 14.08 20.00 9.44 9.92
#> [25] 10.91 5.32 6.71 15.04 11.98 12.62 10.91 9.44 9.93 7.35 18.45 17.09
#> [37] 7.96 6.08 6.71 7.34 12.62 16.02 10.90 9.93 9.44 10.42 21.45 10.91
#> [49] 9.43 6.08

# Same values in ascending order.
sort(loan50[["interest_rate"]])
#> [1] 5.31 5.31 5.32 6.08 6.08 6.08 6.71 6.71 7.34 7.35 7.35 7.96
#> [13] 7.96 7.96 7.97 9.43 9.43 9.44 9.44 9.44 9.92 9.92 9.92 9.92
#> [25] 9.93 9.93 10.42 10.42 10.90 10.90 10.91 10.91 10.91 11.98 12.62 12.62
#> [37] 12.62 14.08 15.04 16.02 17.09 17.09 17.09 18.06 18.45 19.42 20.00 21.45
#> [49] 24.85 26.30

# Extremes.
max(loan50[["interest_rate"]])
#> [1] 26.3
min(loan50[["interest_rate"]])
#> [1] 5.31

# Range: distance between the largest and the smallest value.
diff(range(loan50[["interest_rate"]]))
#> [1] 20.99

# Measures of central tendency; Mode() comes from DescTools and also reports
# how many times the modal value occurs.
mean(loan50[["interest_rate"]])
#> [1] 11.5672
median(loan50[["interest_rate"]])
#> [1] 9.93
Mode(loan50[["interest_rate"]])
#> [1] 9.92
#> attr(,"freq")
#> [1] 4
Dot Plot (gráfico de puntos) Loan 50
# Base-graphics dot plot of the raw interest rates: every observation is drawn
# as an open circle on the horizontal line y = 0.
plot(
  x = loan50$interest_rate,
  y = rep(0, length(loan50$interest_rate)),
  main = 'Plot de puntos: Tasa de interés',
  xlab = '%',
  ylab = '',
  pch = 1
)
# points() is a low-level call that draws on the current plot, so no
# par(new = TRUE) is needed. Setting it would leave the flag pending and make
# the next high-level base plot draw over this figure instead of starting a
# clean frame.
points(
  mean(loan50$interest_rate),
  -.05,
  pch = 17,
  col = "red",
  cex = 2
)
# Note: the red triangle marks the mean.
Se usa la función plot y se superponen dos gráficos: los puntos de datos y el marcador del promedio.
Modificar Conjunto de Datos Loan 50
Redondear la variable interest_rate sin decimales
# Copy of loan50 whose interest_rate is rounded to whole numbers.
loan50_redondeado <- mutate(
  loan50,
  interest_rate = round(interest_rate, digits = 0)
)

# Preview the first rows, keeping column 1 and columns 10 to 14.
loan50_redondeado %>%
  select(c(1, 10:14)) %>%
  head(n = 6)
#> # A tibble: 6 × 6
#> state num_cc_carrying_balance loan_purpose loan_amount grade interest_rate
#> <fct> <int> <fct> <int> <fct> <dbl>
#> 1 NJ 8 debt_consolidat… 22000 B 11
#> 2 CA 2 credit_card 6000 B 10
#> 3 SC 14 debt_consolidat… 25000 E 26
#> 4 CA 10 credit_card 6000 B 10
#> 5 OH 2 home_improvement 25000 B 9
#> 6 IN 4 home_improvement 6400 B 10
Obtener Estadísticos Loan 50 redondeado. Interest Rate
# Descriptive statistics for the rounded interest rates.
# Lines starting with "#>" reproduce the console output of the call above them.

# Values in dataset order.
loan50_redondeado[["interest_rate"]]
#> [1] 11 10 26 10 9 10 17 6 8 13 17 5 7 5 8 25 18 10 8 19 14 20 9 10 11
#> [26] 5 7 15 12 13 11 9 10 7 18 17 8 6 7 7 13 16 11 10 9 10 21 11 9 6

# Same values in ascending order.
sort(loan50_redondeado[["interest_rate"]])
#> [1] 5 5 5 6 6 6 7 7 7 7 7 8 8 8 8 9 9 9 9 9 10 10 10 10 10
#> [26] 10 10 10 11 11 11 11 11 12 13 13 13 14 15 16 17 17 17 18 18 19 20 21 25 26

# Extremes.
max(loan50_redondeado[["interest_rate"]])
#> [1] 26
min(loan50_redondeado[["interest_rate"]])
#> [1] 5

# Range: distance between the largest and the smallest value.
diff(range(loan50_redondeado[["interest_rate"]]))
#> [1] 21

# Measures of central tendency; Mode() comes from DescTools and also reports
# how many times the modal value occurs.
mean(loan50_redondeado[["interest_rate"]])
#> [1] 11.48
median(loan50_redondeado[["interest_rate"]])
#> [1] 10
Mode(loan50_redondeado[["interest_rate"]])
#> [1] 10
#> attr(,"freq")
#> [1] 8
Dot Plot Stacked (apilado)
# Stacked dot plot of the rounded interest rates; the y-axis title and breaks
# are removed.
# The rates are whole numbers, so binwidth = 1 gives one stack per value. This
# replaces the "1/30 of the range" default that ggplot2 reported when no
# binwidth was given.
ggplot(loan50_redondeado, aes(x = interest_rate)) +
  geom_dotplot(binwidth = 1) +
  scale_y_continuous(NULL, breaks = NULL)
¿Cuál es la moda?
¿Regiones de mayor densidad?
Dot Plot Stacked con media
# Stacked dot plot of the rounded interest rates with a red marker at the mean.
# The rates are whole numbers, so binwidth = 1 gives one stack per value
# instead of the "1/30 of the range" default.
ggplot(loan50_redondeado, aes(x = interest_rate)) +
  geom_dotplot(binwidth = 1) +
  scale_y_continuous(NULL, breaks = NULL) +
  annotate(
    "pointrange",
    # The mean is taken from the same rounded data that is plotted; it was
    # previously computed from the unrounded loan50 column.
    x = mean(loan50_redondeado$interest_rate),
    y = -.01,
    ymin = -.01,
    ymax = .2,
    colour = "red"
  )
Dot Plot Stacked con Límites
# Stacked dot plot of the rounded interest rates with blue markers at the
# minimum and at the maximum.
# The rates are whole numbers, so binwidth = 1 gives one stack per value
# instead of the "1/30 of the range" default.
ggplot(loan50_redondeado, aes(x = interest_rate)) +
  geom_dotplot(binwidth = 1) +
  scale_y_continuous(NULL, breaks = NULL) +
  # Lower limit.
  annotate(
    "pointrange",
    x = min(loan50_redondeado$interest_rate),
    y = -.01,
    ymin = -.01,
    ymax = .5,
    colour = "blue"
  ) +
  # Upper limit.
  annotate(
    "pointrange",
    x = max(loan50_redondeado$interest_rate),
    y = -.01,
    ymin = -.01,
    ymax = .5,
    colour = "blue"
  )
Rango Loan 50 redondeado
# Range of the rounded interest rates: largest value minus smallest value.
diff(range(loan50_redondeado$interest_rate))
#> [1] 21
Sesgos en Loan 50 Redondeado
# Histograms of the rounded interest rates: the same data drawn first with
# 8 bins and then with 12 bins.
loan50_redondeado %>%
  ggplot(aes(x = interest_rate)) +
  geom_histogram(bins = 8, fill = "blue")

loan50_redondeado %>%
  ggplot(aes(x = interest_rate)) +
  geom_histogram(bins = 12, fill = "blue")

# Loan 50 statistics - II
# Standard deviation of the raw interest rates.
sd(loan50[["interest_rate"]])
#> [1] 5.052115

# Minimum, quartiles, mean and maximum of the raw interest rates.
summary(loan50[["interest_rate"]])
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 5.31 7.96 9.93 11.57 13.71 26.30
Conjunto de Datos Babies
Selección de una muestra aleatoria de la variable “age”
# Seed the random number generator, then draw 50 rows from babies.
set.seed(5313)
babies_subset <- sample_n(babies, size = 50)

# Statistics for babies_subset age
# Descriptive statistics for age in the 50-row sample.
# Lines starting with "#>" reproduce the console output of the call above them.

# Values in sample order.
babies_subset[["age"]]
#> [1] 30 20 25 30 25 37 37 39 24 26 29 22 27 26 26 23 38 25 30 21 24 23 22 23 20
#> [26] 31 25 33 31 35 19 24 19 25 40 21 21 39 25 24 39 24 31 26 26 37 22 37 36 19

# Extremes.
min(babies_subset[["age"]])
#> [1] 19
max(babies_subset[["age"]])
#> [1] 40

# Range: distance between the largest and the smallest value.
diff(range(babies_subset[["age"]]))
#> [1] 21

# Minimum, quartiles, mean and maximum in a single call.
summary(babies_subset[["age"]])
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 19.00 23.00 25.50 27.62 31.00 40.00

mean(babies_subset[["age"]])
#> [1] 27.62
median(babies_subset[["age"]])
#> [1] 25.5
Dot Plot Stacked babies_subset age
# Stacked dot plot of age in the sample; the y-axis title and breaks are
# removed.
# The sampled ages are whole numbers, so binwidth = 1 gives one stack per age
# instead of the "1/30 of the range" default that ggplot2 reported.
ggplot(data = babies_subset) +
  geom_dotplot(aes(x = age), binwidth = 1) +
  scale_y_continuous(NULL, breaks = NULL)
# Stacked dot plot of age in the sample, filled in steel blue, with a red point
# at the sample mean just below the baseline.
# The sampled ages are whole numbers, so binwidth = 1 gives one stack per age
# instead of the "1/30 of the range" default that ggplot2 reported.
ggplot(data = babies_subset) +
  geom_dotplot(aes(x = age), binwidth = 1, fill = "steelblue") +
  scale_y_continuous(NULL, breaks = NULL) +
  annotate(
    'point',
    x = mean(babies_subset$age),
    y = -.01,
    colour = 'red'
  )
Variaciones en los bins del histograma de babies_subset
# Histograms of age with different bin counts.
# Lines starting with "#>" reproduce the messages printed by the call above.

# Sample, default number of bins.
babies_subset %>%
  ggplot(aes(x = age)) +
  geom_histogram()
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Sample, 15 bins.
babies_subset %>%
  ggplot(aes(x = age)) +
  geom_histogram(bins = 15)

# Sample, 10 bins.
babies_subset %>%
  ggplot(aes(x = age)) +
  geom_histogram(bins = 10)

# Full babies data set, default number of bins.
babies %>%
  ggplot(aes(x = age)) +
  geom_histogram()
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> Warning: Removed 2 rows containing non-finite outside the scale range
#> (`stat_bin()`).
¿Qué tipo de distribución es (unimodal, bimodal o multimodal)?
Datos No Agrupados
# Draw 30 star ratings with replacement; the five labels are sampled with
# probabilities 0.1, 0.1, 0.2, 0.4 and 0.2 respectively.
calificacion <- sample(
  x = c('⭐️', '⭐️⭐️', '⭐️⭐️⭐️', '⭐️⭐️⭐️⭐️', '⭐️⭐️⭐️⭐️⭐'),
  size = 30,
  replace = TRUE,
  prob = c(.1, .1, .2, .4, .2)
)

# Show the sampled ratings; "#>" lines reproduce the console output.
calificacion
#> [1] "⭐️⭐️⭐️⭐️" "⭐️⭐️⭐️⭐️" "⭐️⭐️⭐️⭐️" "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️"
#> [6] "⭐️⭐️" "⭐️⭐️" "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️"
#> [11] "⭐️⭐️⭐️" "⭐️⭐️⭐️⭐️" "⭐️⭐️⭐️" "⭐️⭐️⭐️⭐️" "⭐️⭐️⭐️⭐️⭐"
#> [16] "⭐️⭐️⭐️⭐️" "⭐️⭐️⭐️" "⭐️⭐️⭐️" "⭐️⭐️⭐️⭐️" "⭐️⭐️⭐️⭐️⭐"
#> [21] "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️" "⭐️⭐️⭐️⭐️" "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️⭐"
#> [26] "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️" "⭐️⭐️" "⭐️⭐️⭐️" "⭐️"
# Frequency table of the ratings, one row per distinct label (Xi):
#   fi = absolute frequency, hi = relative frequency in percent,
#   Fi = cumulative absolute frequency, Hi = cumulative relative frequency.
# as_tibble() replaces as_data_frame(), which was deprecated in tibble 2.0.0
# and raised a warning on every run.
table(calificacion) %>%
  as_tibble() %>%
  dplyr::rename(Xi = 1, fi = n) %>%
  mutate(
    hi = (fi / sum(fi)) * 100,
    Fi = cumsum(fi),
    Hi = cumsum(hi)
  )
#> # A tibble: 5 × 5
#> Xi fi hi Fi Hi
#> <chr> <int> <dbl> <int> <dbl>
#> 1 ⭐️ 1 3.33 1 3.33
#> 2 ⭐️⭐️ 3 10 4 13.3
#> 3 ⭐️⭐️⭐️ 5 16.7 9 30
#> 4 ⭐️⭐️⭐️⭐️ 12 40 21 70
#> 5 ⭐️⭐️⭐️⭐️⭐ 9 30 30 100
# Store the frequency table of the ratings, one row per distinct label (Xi):
#   fi = absolute frequency, hi = relative frequency in percent,
#   Fi = cumulative absolute frequency, Hi = cumulative relative frequency.
# as_tibble() replaces as_data_frame(), which was deprecated in tibble 2.0.0.
df_calificacion <- table(calificacion) %>%
  as_tibble() %>%
  dplyr::rename(Xi = 1, fi = n) %>%
  mutate(
    hi = (fi / sum(fi)) * 100,
    Fi = cumsum(fi),
    Hi = cumsum(hi)
  )

df_calificacion
#> # A tibble: 5 × 5
#> Xi fi hi Fi Hi
#> <chr> <int> <dbl> <int> <dbl>
#> 1 ⭐️ 1 3.33 1 3.33
#> 2 ⭐️⭐️ 3 10 4 13.3
#> 3 ⭐️⭐️⭐️ 5 16.7 9 30
#> 4 ⭐️⭐️⭐️⭐️ 12 40 21 70
#> 5 ⭐️⭐️⭐️⭐️⭐ 9 30 30 100
# NOTE(review): attaching plyr after dplyr masks dplyr verbs such as mutate()
# and rename(). The conversion below no longer needs plyr; the line is kept
# only in case code outside this section relies on it. Confirm and remove.
library(plyr)

# Convert each star label to its numeric score (1 to 5). match() returns the
# position of every label in the ordered label vector, which avoids the
# label -> character digit -> number round trip of mapvalues() + as.numeric().
# as.numeric() keeps the result a double vector, as before.
calificacion2 <- as.numeric(match(
  calificacion,
  c('⭐️', '⭐️⭐️', '⭐️⭐️⭐️', '⭐️⭐️⭐️⭐️', "⭐️⭐️⭐️⭐️⭐")
))

# Scores in sample order and in ascending order; "#>" lines reproduce the
# console output.
calificacion2
#> [1] 4 4 4 5 4 2 2 5 5 4 3 4 3 4 5 4 3 3 4 5 5 4 4 5 5 5 4 2 3 1
sort(calificacion2)
#> [1] 1 2 2 2 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5

# Measures of central tendency; Mode() comes from DescTools and also reports
# how many times the modal value occurs.
mean(calificacion2)
#> [1] 3.833333
median(calificacion2)
#> [1] 4
Mode(calificacion2)
#> [1] 4
#> attr(,"freq")
#> [1] 12
# Histogram of the numeric scores with one bar per score.
# breaks = 4 produced the break points 1, 2, 3, 4, 5, whose first interval
# [1, 2] lumps scores 1 and 2 into one bar. Breaks centred on the integers
# give each score from 1 to 5 its own bar.
hist(calificacion2, breaks = seq(0.5, 5.5, by = 1))