Transpose a data.frame and count non-NA values per column

I have this dataframe:

# Reproducible example: five character columns KAS1_1 ... KAS1_5, each
# filled with ten draws (with replacement) from a single letter or NA.
set.seed(100)
x <- do.call(
  data.frame,
  c(
    # lapply() visits the letters in order, so the random draws happen in
    # the same sequence as five explicit sample() calls would.
    lapply(
      setNames(letters[1:5], paste0("KAS1_", 1:5)),
      function(ltr) sample(c(ltr, NA), 10, replace = TRUE)
    ),
    list(stringsAsFactors = FALSE)
  )
)
> x
   KAS1_1 KAS1_2 KAS1_3 KAS1_4 KAS1_5
1       a   <NA>   <NA>      d      e
2       a   <NA>   <NA>   <NA>   <NA>
3    <NA>      b   <NA>      d   <NA>
4       a      b   <NA>   <NA>   <NA>
5       a   <NA>      c   <NA>   <NA>
6       a   <NA>      c   <NA>      e
7    <NA>      b   <NA>      d   <NA>
8       a      b   <NA>   <NA>   <NA>
9    <NA>      b   <NA>   <NA>      e
10      a   <NA>      c      d      e

And I'm looking for a way to get this:

   Var   Count
KAS1_1   7
KAS1_2   5
KAS1_3   3
KAS1_4   4
KAS1_5   4

I've tried dplyr, table(), lapply() and t(), but without success. Is there a more straightforward way?

2 answers

  • answered 2019-03-13 19:34 markus

    base R option

    # Count the non-NA entries of every column, then stack() the named list
    # into a two-column data frame (values = count, ind = column name).
    stack(lapply(x, function(col) sum(!is.na(col))))
    #  values    ind
    #1      7 KAS1_1
    #2      5 KAS1_2
    #3      3 KAS1_3
    #4      4 KAS1_4
    #5      4 KAS1_5
    

    Another tidyverse option besides Gregor's solution is

    library(dplyr)
    library(tidyr)
    # pivot_longer() (tidyr >= 1.0.0) supersedes gather(). Setting
    # values_drop_na = TRUE drops the NA cells, so count() tallies only the
    # non-missing values of each original column.
    x %>%
      pivot_longer(everything(), names_to = "key", values_drop_na = TRUE) %>%
      count(key)
    

  • answered 2019-03-13 19:34 Gregor

    ## halfway there: !is.na(x) is a logical matrix (TRUE where a value is
    ## present), and colSums() adds up the TRUEs in each column
    colSums(!is.na(x))
    # KAS1_1 KAS1_2 KAS1_3 KAS1_4 KAS1_5 
    #      7      5      3      4      4 
    
    ## make it a data frame; the column names end up as the row names
    data.frame(count = colSums(!is.na(x)))
    #        count
    # KAS1_1     7
    # KAS1_2     5
    # KAS1_3     3
    # KAS1_4     4
    # KAS1_5     4
    
    ## or use `stack` like markus's nice answer: the counts go into `values`
    ## and the column names into `ind`
    stack(colSums(!is.na(x)))
    #   values    ind
    # 1      7 KAS1_1
    # 2      5 KAS1_2
    # 3      3 KAS1_3
    # 4      4 KAS1_4
    # 5      4 KAS1_5
    

    Converting the row names to their own column would be another step, but I'll leave that to you.

    The tidyverse solution would have you convert to long format and then do a grouped sum:

    library(dplyr)
    library(tidyr)
    # Reshape to long format (one row per cell), then count the non-NA cells
    # within each original column. pivot_longer() (tidyr >= 1.0.0) supersedes
    # gather(); names_to = "key" keeps the column name used downstream.
    x %>%
      pivot_longer(everything(), names_to = "key") %>%
      group_by(key) %>%
      summarize(value = sum(!is.na(value)))
    # # A tibble: 5 x 2
    #   key    value
    #   <chr>  <int>
    # 1 KAS1_1     7
    # 2 KAS1_2     5
    # 3 KAS1_3     3
    # 4 KAS1_4     4
    # 5 KAS1_5     4
    

    A data.table solution would be similar:

    library(data.table)
    # Melt every column into long format, then count the non-NA values per
    # original column (named `variable` after melting).
    xdt <- as.data.table(x)
    melt(xdt, measure.vars = names(xdt))[
      , .(count = sum(!is.na(value))), by = variable
    ]
    #    variable count
    # 1:   KAS1_1     7
    # 2:   KAS1_2     5
    # 3:   KAS1_3     3
    # 4:   KAS1_4     4
    # 5:   KAS1_5     4