R_How can I fill blanks in one column with mean of two other columns?

I am trying to fill blanks in var1 with the mean of var2 and var3, but I can't get it to work. This is what I've tried so far:

# Example data. Mixing numbers with "" inside c() means var1 is not stored as
# numeric: it becomes character (or a factor under the R < 4.0 defaults).
df <- data.frame(var1=c(1,2,"",3,3,"","",2,2,6,7,3,"","","",3,3,11,12,2,"",3))
df$var2 <- c(1,8,9,1,1,5,8,8,3,2,0,9,4,4,7,3,5,5,2,4,6,6)
df$var3 <- c(4,1,1,4,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22)


 # NOTE(review): this loop leaves var1 unchanged, for three reasons:
 #   * the blanks are empty strings, and is.na("") is FALSE, so the test is
 #     never TRUE and the "yes" branch is never evaluated;
 #   * mean(a, b) does not average a and b -- the second positional argument
 #     of mean() is `trim`, so only the first value is used;
 #   * the "no" branch uses `==`, which compares and assigns nothing.
 for(i in 1:length(df$var1)) {
   ifelse(is.na(df$var1[i]), df$var1[i] <- mean(df$var2[i], df$var3[i]), df$var1[i] == df$var1[i])
 }

I am not sure what I am doing wrong. After running the code, var1 still shows empty cells.

Thank you very much for your help

3 answers

  • answered 2018-10-11 19:53 ebb

    Try this:

    # Build the example data. The blanks ("") force var1 to be stored as text;
    # stringsAsFactors = FALSE keeps it character rather than factor.
    df <- data.frame(
      var1 = c(1, 2, "", 3, 3, "", "", 2, 2, 6, 7, 3, "", "", "", 3, 3, 11, 12,
               2, "", 3),
      var2 = c(1, 8, 9, 1, 1, 5, 8, 8, 3, 2, 0, 9, 4, 4, 7, 3, 5, 5, 2, 4, 6, 6),
      var3 = c(4, 1, 1, 4, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
               20, 21, 22),
      stringsAsFactors = FALSE
    )
    # Mark every blank cell with the text "NA" (a placeholder string, not a
    # real missing value).
    df[df == ""] <- "NA"

    # Remember which rows of var1 held the placeholder.
    was_blank <- df$var1 == "NA"

    # Make var1 numeric *before* filling it, so the means are stored as numbers
    # instead of being coerced to text. Placeholder rows become real NA for now.
    df$var1 <- as.numeric(replace(df$var1, was_blank, NA))

    # Fill all formerly blank rows at once with the row mean of var2 and var3.
    df$var1[was_blank] <- rowMeans(df[was_blank, c("var2", "var3")])
    

    Or:

    # ifelse() is vectorised: one call handles every row, so no loop is needed.
    # Rows holding the "NA" placeholder get the row mean of columns 2 and 3;
    # every other row keeps its own value. as.numeric() makes the result a
    # numeric column (assumes column 1 is character, as in the setup above it).
    df[, 1] <- ifelse(
      df[, 1] == "NA",
      rowMeans(df[, 2:3]),
      as.numeric(df[, 1])
    )
    

    Alternatively, instead of redefining blanks as "NA" (as text in the example above), you could leave them blank, skipping the df[df==""] <- "NA" step:

    # ifelse() is vectorised: one call handles every row, so no loop is needed.
    # Rows where column 1 is the empty string get the row mean of columns 2
    # and 3; every other row keeps its own value. as.numeric() makes the result
    # a numeric column (assumes column 1 is character with "" for blanks).
    df[, 1] <- ifelse(
      df[, 1] == "",
      rowMeans(df[, 2:3]),
      as.numeric(df[, 1])
    )
    

    Or identify blanks as "real" NAs:

    # Turn every blank cell into a real missing value.
    df[df == ""] <- NA

    # ifelse() is vectorised: one call handles every row, so no loop is needed.
    # Missing rows get the row mean of columns 2 and 3; every other row keeps
    # its own value. as.numeric() makes the result a numeric column (assumes
    # column 1 is character at this point).
    df[, 1] <- ifelse(
      is.na(df[, 1]),
      rowMeans(df[, 2:3]),
      as.numeric(df[, 1])
    )
    

  • answered 2018-10-11 20:42 AndS.

    Another way without any loops:

    library(dplyr)

    df %>%
      # Convert var1..var3 to numeric; blank strings become NA. A factor is
      # converted through its labels, because as.numeric() on a factor would
      # return the internal level codes instead of the displayed values.
      mutate(across(var1:var3, function(x) {
        if (is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
      })) %>%
      # Where var1 is missing use the mean of var2 and var3, else keep var1.
      mutate(var1 = case_when(is.na(var1) ~ (var2 + var3) / 2, TRUE ~ var1))
    #>    var1 var2 var3
    #> 1   1.0    1    4
    #> 2   2.0    8    1
    #> 3   5.0    9    1
    #> 4   3.0    1    4
    #> 5   3.0    1    4
    #> 6   5.5    5    6
    #> 7   7.5    8    7
    #> 8   2.0    8    8
    #> 9   2.0    3    9
    #> 10  6.0    2   10
    #> 11  7.0    0   11
    #> 12  3.0    9   12
    #> 13  8.5    4   13
    #> 14  9.0    4   14
    #> 15 11.0    7   15
    #> 16  3.0    3   16
    #> 17  3.0    5   17
    #> 18 11.0    5   18
    #> 19 12.0    2   19
    #> 20  2.0    4   20
    #> 21 13.5    6   21
    #> 22  3.0    6   22
    

  • answered 2018-10-11 23:28 mmn

    I would use a data.table approach here. It should work well with larger data, and it avoids looping over your data where you don't need to.

    library(data.table)
    # Build the example data; the blanks ("") make var1 a character column.
    dt <- data.table(
      var1 = c(1, 2, "", 3, 3, "", "", 2, 2, 6, 7, 3, "", "", "", 3, 3, 11, 12,
               2, "", 3),
      var2 = c(1, 8, 9, 1, 1, 5, 8, 8, 3, 2, 0, 9, 4, 4, 7, 3, 5, 5, 2, 4, 6, 6),
      var3 = c(4, 1, 1, 4, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
               20, 21, 22)
    )
    # Convert var1 to numeric; the blank strings become NA.
    dt[, var1 := as.numeric(var1)]
    # For the NA rows only, set var1 to the row mean of var2 and var3.
    # rowMeans() works on the whole subset at once, unlike apply(.SD, 1, mean),
    # which calls mean() once per row.
    dt[is.na(var1), var1 := rowMeans(.SD), .SDcols = c("var2", "var3")]

    dt
    
    
        var1 var2 var3
     1:  1.0    1    4
     2:  2.0    8    1
     3:  5.0    9    1
     4:  3.0    1    4
     5:  3.0    1    4
     6:  5.5    5    6
     7:  7.5    8    7
     8:  2.0    8    8
     9:  2.0    3    9
    10:  6.0    2   10
    11:  7.0    0   11
    12:  3.0    9   12
    13:  8.5    4   13
    14:  9.0    4   14
    15: 11.0    7   15
    16:  3.0    3   16
    17:  3.0    5   17
    18: 11.0    5   18
    19: 12.0    2   19
    20:  2.0    4   20
    21: 13.5    6   21
    22:  3.0    6   22