clase 08 apoyo

library(ggplot2)
library(dplyr)
library(openintro)
library(DescTools)

Conjunto de Datos Loan 50

Variable a estudiar “interest_rate”

Obtener Estadísticos Loan 50. Interest Rate

loan50$interest_rate

 [1] 10.90  9.92 26.30  9.92  9.43  9.92 17.09  6.08  7.97 12.62 17.09  5.31
[13]  7.35  5.31  7.96 24.85 18.06 10.42  7.96 19.42 14.08 20.00  9.44  9.92
[25] 10.91  5.32  6.71 15.04 11.98 12.62 10.91  9.44  9.93  7.35 18.45 17.09
[37]  7.96  6.08  6.71  7.34 12.62 16.02 10.90  9.93  9.44 10.42 21.45 10.91
[49]  9.43  6.08

sort(loan50$interest_rate)

 [1]  5.31  5.31  5.32  6.08  6.08  6.08  6.71  6.71  7.34  7.35  7.35  7.96
[13]  7.96  7.96  7.97  9.43  9.43  9.44  9.44  9.44  9.92  9.92  9.92  9.92
[25]  9.93  9.93 10.42 10.42 10.90 10.90 10.91 10.91 10.91 11.98 12.62 12.62
[37] 12.62 14.08 15.04 16.02 17.09 17.09 17.09 18.06 18.45 19.42 20.00 21.45
[49] 24.85 26.30

max(loan50$interest_rate)

[1] 26.3

min(loan50$interest_rate)

[1] 5.31

# Range of the interest rate; max is never below min, so abs() is not needed.
max(loan50$interest_rate) - min(loan50$interest_rate)

[1] 20.99

mean(loan50$interest_rate)

[1] 11.5672

median(loan50$interest_rate)

[1] 9.93

Mode(loan50$interest_rate)

[1] 9.92
attr(,"freq")
[1] 4

Dot Plot (gráfico de puntos) Loan 50

# One-dimensional dot plot: every rate is drawn at height 0 along the x axis.
plot(
  x = loan50$interest_rate,
  y = rep(0, length(loan50$interest_rate)),
  main = 'Plot de puntos: Tasa de interés',
  xlab = '%',
  ylab = '',
  pch = 1
)

# points() is a low-level call that draws on the current plot, so no
# par(new = TRUE) is required. Setting it here would leave the flag active
# and keep the next high-level base plot from clearing the frame.
points(
  mean(loan50$interest_rate),
  -.05,
  pch = 17,
  col = "red",
  cex = 2
)

Nota: el triángulo rojo representa el promedio.

Se usa la función plot() y, sobre el mismo gráfico, se superpone el promedio con points().

Modificar Conjunto de Datos Loan 50

Redondear variable interest_rate sin decimales

# Copy of loan50 whose interest_rate is rounded to whole numbers
# (round() uses digits = 0 by default).
loan50_redondeado <- mutate(loan50, interest_rate = round(interest_rate))

# Preview the first rows, picking the columns by name rather than by position.
loan50_redondeado %>%
  select(
    state,
    num_cc_carrying_balance,
    loan_purpose,
    loan_amount,
    grade,
    interest_rate
  ) %>%
  head()

# A tibble: 6 × 6
  state num_cc_carrying_balance loan_purpose     loan_amount grade interest_rate
  <fct>                   <int> <fct>                  <int> <fct>         <dbl>
1 NJ                          8 debt_consolidat…       22000 B                11
2 CA                          2 credit_card             6000 B                10
3 SC                         14 debt_consolidat…       25000 E                26
4 CA                         10 credit_card             6000 B                10
5 OH                          2 home_improvement       25000 B                 9
6 IN                          4 home_improvement        6400 B                10

Obtener Estadísticos Loan 50 redondeado. Interest Rate

loan50_redondeado$interest_rate

 [1] 11 10 26 10  9 10 17  6  8 13 17  5  7  5  8 25 18 10  8 19 14 20  9 10 11
[26]  5  7 15 12 13 11  9 10  7 18 17  8  6  7  7 13 16 11 10  9 10 21 11  9  6

sort(loan50_redondeado$interest_rate)

 [1]  5  5  5  6  6  6  7  7  7  7  7  8  8  8  8  9  9  9  9  9 10 10 10 10 10
[26] 10 10 10 11 11 11 11 11 12 13 13 13 14 15 16 17 17 17 18 18 19 20 21 25 26

max(loan50_redondeado$interest_rate)

[1] 26

min(loan50_redondeado$interest_rate)

[1] 5

# Range of the rounded rate; max is never below min, so abs() is not needed.
max(loan50_redondeado$interest_rate) - min(loan50_redondeado$interest_rate)

[1] 21

mean(loan50_redondeado$interest_rate)

[1] 11.48

median(loan50_redondeado$interest_rate)

[1] 10

Mode(loan50_redondeado$interest_rate)

[1] 10
attr(,"freq")
[1] 8

Dot Plot Stacked (apilado)

# Stacked dot plot of the rounded rates. The y axis title and breaks are
# removed from the plot.
loan50_redondeado %>%
  ggplot(aes(x = interest_rate)) +
  geom_dotplot() +
  scale_y_continuous(name = NULL, breaks = NULL)

Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

¿Cuál es la moda?
¿Regiones de mayor densidad?

Dot Plot Stacked con media

# Stacked dot plot of the rounded rates with their mean marked in red.
# The marker is computed from loan50_redondeado, the data actually plotted,
# instead of from the unrounded loan50 column.
ggplot(loan50_redondeado,
       aes(x = interest_rate)) +
  geom_dotplot() +
  scale_y_continuous(NULL, breaks = NULL) +
  annotate("pointrange",
           x = mean(loan50_redondeado$interest_rate),
           y = -.01,
           ymin = -.01,
           ymax = .2,
           colour = "red")

Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

Dot Plot Stacked con Límites

# Stacked dot plot of the rounded rates, with blue markers at the smallest
# and largest observed values.
loan50_redondeado %>%
  ggplot(aes(x = interest_rate)) +
  geom_dotplot() +
  scale_y_continuous(name = NULL, breaks = NULL) +
  annotate(
    "pointrange",
    # range() yields c(min, max), so a single layer draws both limit markers.
    x = range(loan50_redondeado$interest_rate),
    y = -.01,
    ymin = -.01,
    ymax = .5,
    colour = "blue"
  )

Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

Rango Loan 50 redondeado

max(loan50_redondeado$interest_rate)-min(loan50_redondeado$interest_rate)

[1] 21

Sesgos en Loan 50 Redondeado

# Two histograms of the rounded rates, first with 8 bins and then with 12.
ggplot(loan50_redondeado, aes(interest_rate)) +
  geom_histogram(bins = 8, fill = "blue")

ggplot(loan50_redondeado, aes(interest_rate)) +
  geom_histogram(bins = 12, fill = "blue")

Estadísticos Loan 50 -II

sd(loan50$interest_rate)

[1] 5.052115

summary(loan50$interest_rate)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5.31    7.96    9.93   11.57   13.71   26.30

Conjunto de Datos Babies

Selección de una muestra aleatoria, variable “age”

# Fix the RNG seed before sampling, then draw 50 rows of babies.
set.seed(5313)
babies_subset <- sample_n(babies, 50)

Estadísticos babies_subset age

babies_subset$age

 [1] 30 20 25 30 25 37 37 39 24 26 29 22 27 26 26 23 38 25 30 21 24 23 22 23 20
[26] 31 25 33 31 35 19 24 19 25 40 21 21 39 25 24 39 24 31 26 26 37 22 37 36 19

min(babies_subset$age)

[1] 19

max(babies_subset$age)

[1] 40

# Range of the sampled ages; max is never below min, so abs() is not needed.
max(babies_subset$age) - min(babies_subset$age)

[1] 21

summary(babies_subset$age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  19.00   23.00   25.50   27.62   31.00   40.00

mean(babies_subset$age)

[1] 27.62

median(babies_subset$age)

[1] 25.5

Dot Plot Stacked babies_subset age

# Stacked dot plot of the sampled ages, with the y axis title and breaks
# removed.
ggplot(babies_subset, aes(x = age)) +
  geom_dotplot() +
  scale_y_continuous(name = NULL, breaks = NULL)

Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

# Stacked dot plot of the sampled ages (steel-blue dots) with a red point
# placed at the sample mean, just below the baseline.
ggplot(babies_subset, aes(x = age)) +
  geom_dotplot(fill = "steelblue") +
  scale_y_continuous(name = NULL, breaks = NULL) +
  annotate(
    "point",
    x = mean(babies_subset$age),
    y = -.01,
    colour = "red"
  )

Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

Variaciones en Bins del Histograma babies_subset

# Histogram of the sampled ages using the default number of bins.
babies_subset %>%
  ggplot(aes(x = age)) +
  geom_histogram()

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram of the sampled ages with 15 bins and then with 10 bins.
babies_subset %>%
  ggplot(aes(x = age)) +
  geom_histogram(bins = 15)

babies_subset %>%
  ggplot(aes(x = age)) +
  geom_histogram(bins = 10)

# Histogram of age for the full babies data set (default number of bins).
babies %>%
  ggplot(aes(x = age)) +
  geom_histogram()

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_bin()`).

¿Qué tipo de distribución es: unimodal, bimodal o multimodal?

Datos No Agrupados

# Simulate 30 star ratings (one to five stars), drawn with replacement using
# the given probabilities; four stars is the most likely value.
calificacion <- sample(
  x = c(
    '⭐️',
    '⭐️⭐️',
    '⭐️⭐️⭐️',
    '⭐️⭐️⭐️⭐️',
    '⭐️⭐️⭐️⭐️⭐'
  ),
  size = 30,
  replace = TRUE,
  prob = c(0.1, 0.1, 0.2, 0.4, 0.2)
)
calificacion

 [1] "⭐️⭐️⭐️⭐️"   "⭐️⭐️⭐️⭐️"   "⭐️⭐️⭐️⭐️"   "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️"  
 [6] "⭐️⭐️"       "⭐️⭐️"       "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️"  
[11] "⭐️⭐️⭐️"     "⭐️⭐️⭐️⭐️"   "⭐️⭐️⭐️"     "⭐️⭐️⭐️⭐️"   "⭐️⭐️⭐️⭐️⭐"
[16] "⭐️⭐️⭐️⭐️"   "⭐️⭐️⭐️"     "⭐️⭐️⭐️"     "⭐️⭐️⭐️⭐️"   "⭐️⭐️⭐️⭐️⭐"
[21] "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️"   "⭐️⭐️⭐️⭐️"   "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️⭐"
[26] "⭐️⭐️⭐️⭐️⭐" "⭐️⭐️⭐️⭐️"   "⭐️⭐️"       "⭐️⭐️⭐️"     "⭐️"

# crear las clases (intervalos) para 1 y 2

# Frequency table of the ratings:
#   Xi = rating value, fi = absolute frequency, hi = relative frequency (%),
#   Fi = cumulative absolute frequency, Hi = cumulative relative frequency (%).
# as_tibble() replaces as_data_frame(), which is deprecated since
# tibble 2.0.0 and emits a warning on use.
table(calificacion) %>%
  as_tibble() %>%
  dplyr::rename(Xi = 1, fi = n) %>%
  mutate(
    hi = (fi / sum(fi)) * 100,
    Fi = cumsum(fi),
    Hi = cumsum(hi)
  )

Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
ℹ Please use `as_tibble()` (with slightly different semantics) to convert to a
  tibble, or `as.data.frame()` to convert to a data frame.

# A tibble: 5 × 5
  Xi            fi    hi    Fi     Hi
  <chr>      <int> <dbl> <int>  <dbl>
1 ⭐️             1  3.33     1   3.33
2 ⭐️⭐️           3 10        4  13.3 
3 ⭐️⭐️⭐️         5 16.7      9  30   
4 ⭐️⭐️⭐️⭐️      12 40       21  70   
5 ⭐️⭐️⭐️⭐️⭐     9 30       30 100

# Store the frequency table of the ratings:
#   Xi = rating value, fi = absolute frequency, hi = relative frequency (%),
#   Fi = cumulative absolute frequency, Hi = cumulative relative frequency (%).
# as_tibble() replaces as_data_frame(), which is deprecated since
# tibble 2.0.0 and emits a warning on use.
df_calificacion <- table(calificacion) %>%
  as_tibble() %>%
  dplyr::rename(Xi = 1, fi = n) %>%
  mutate(
    hi = (fi / sum(fi)) * 100,
    Fi = cumsum(fi),
    Hi = cumsum(hi)
  )

df_calificacion

# A tibble: 5 × 5
  Xi            fi    hi    Fi     Hi
  <chr>      <int> <dbl> <int>  <dbl>
1 ⭐️             1  3.33     1   3.33
2 ⭐️⭐️           3 10        4  13.3 
3 ⭐️⭐️⭐️         5 16.7      9  30   
4 ⭐️⭐️⭐️⭐️      12 40       21  70   
5 ⭐️⭐️⭐️⭐️⭐     9 30       30 100

library(plyr)

# Convert each star label to its numeric score (1 to 5).
# NOTE(review): the plyr:: prefix works whether or not plyr is attached;
# attaching plyr after dplyr masks several dplyr verbs, so confirm the
# library(plyr) call is really needed.
calificacion2 <- as.numeric(
  plyr::mapvalues(
    calificacion,
    from = c('⭐️','⭐️⭐️','⭐️⭐️⭐️','⭐️⭐️⭐️⭐️',"⭐️⭐️⭐️⭐️⭐"),
    to = 1:5
  )
)


calificacion2

 [1] 4 4 4 5 4 2 2 5 5 4 3 4 3 4 5 4 3 3 4 5 5 4 4 5 5 5 4 2 3 1

sort(calificacion2)

 [1] 1 2 2 2 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5

mean(calificacion2)

[1] 3.833333

median(calificacion2)

[1] 4

Mode(calificacion2)

[1] 4
attr(,"freq")
[1] 12

# One bar per rating: the breaks are centred on the integers 1 to 5.
# A bare `breaks = 4` is only a suggestion; it produces the cut points 1:5,
# which puts ratings 1 and 2 together in the first bin [1, 2].
hist(calificacion2, breaks = seq(0.5, 5.5, by = 1))