13 Case Study in Data Wrangling

13.1 Yeast Genomics

Smith and Kruglyak (2008) is a study that measured 2820 genotypes in 109 yeast F1 segregants from a cross between parental lines BY and RM.

They also measured gene expression on 4482 genes in each of these segregants when growing in two different Carbon sources, glucose and ethanol.

13.1.1 Load Data

The data was distributed as a collection of matrices in R.

> rm(list=ls())
> load("./data/smith_kruglyak.RData")
> ls()
[1] "exp.e"      "exp.g"      "exp.pos"    "marker"     "marker.pos"
> eapply(env=.GlobalEnv, dim)
$exp.e
[1] 4482  109

$exp.g
[1] 4482  109

$marker
[1] 2820  109

$exp.pos
[1] 4482    3

$marker.pos
[1] 2820    2

13.1.2 Gene Expression Matrices

> exp.g %>% cbind(rownames(exp.g), .) %>% as_tibble() %>% 
+   print()
Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
This warning is displayed once per session.
# A tibble: 4,482 x 110
   V1    X100g.20_4_c.gl… X101g.21_1_d.gl… X102g.21_2_d.gl…
   <chr> <chr>            <chr>            <chr>           
 1 YJR1… 0.22             0.18             0.05            
 2 YPL2… -0.29            -0.2             -0.19           
 3 YDR5… 0.72             0.04             0.26            
 4 YDR2… 0.23             0.31             0.12            
 5 YHR0… 0.4              -0.04            0.36            
 6 YFR0… -0.36            0.35             -0.26           
 7 YPL1… 0.23             -0.21            -0.25           
 8 YDR0… -0.09            0.57             0.24            
 9 YLR3… -0.23            0.13             -0.17           
10 YCR0… -0.25            -0.98            -0.3            
# … with 4,472 more rows, and 106 more variables:
#   X103g.21_3_d.glucose <chr>, X104g.21_4_d.glucose <chr>,
#   X105g.21_5_c.glucose <chr>, X106g.22_2_d.glucose <chr>,
#   X107g.22_3_b.glucose <chr>, X109g.22_5_d.glucose <chr>,
#   X10g.2_5_d.glucose <chr>, X110g.23_3_d.glucose <chr>,
#   X111g.23_5_d.glucose <chr>, X112g.24_1_d.glucose <chr>,
#   X113g.25_1_d.glucose <chr>, X114g.25_3_d.glucose <chr>,
#   X115g.25_4_d.glucose <chr>, X116g.26_1_d.glucose <chr>,
#   X117g.26_2_d.glucose <chr>, X11g.2_6_d.glucose <chr>,
#   X12g.2_7_a.glucose <chr>, X13g.3_1_d.glucose <chr>,
#   X15g.3_3_d.glucose <chr>, X16g.3_4_d.glucose <chr>,
#   X17g.3_5_d.glucose <chr>, X18g.4_1_c.glucose <chr>,
#   X1g.1_1_d.glucose <chr>, X20g.4_3_d.glucose <chr>,
#   X21g.4_4_d.glucose <chr>, X22g.5_1_d.glucose <chr>,
#   X23g.5_2_d.glucose <chr>, X24g.5_3_d.glucose <chr>,
#   X25g.5_4_d.glucose <chr>, X26g.5_5_d.glucose <chr>,
#   X27g.6_1_d.glucose <chr>, X28g.6_2_b.glucose <chr>,
#   X29g.6_3_c.glucose <chr>, X30g.6_4_d.glucose <chr>,
#   X31g.6_5_d.glucose <chr>, X32g.6_6_d.glucose <chr>,
#   X33g.6_7_d.glucose <chr>, X34g.7_1_d.glucose <chr>,
#   X35g.7_2_c.glucose <chr>, X36g.7_3_d.glucose <chr>,
#   X37g.7_4_c.glucose <chr>, X38g.7_5_d.glucose <chr>,
#   X39g.7_6_c.glucose <chr>, X3g.1_3_d.glucose <chr>,
#   X40g.7_7_c.glucose <chr>, X41g.7_8_d.glucose <chr>,
#   X42g.8_1_a.glucose <chr>, X43g.8_2_d.glucose <chr>,
#   X44g.8_3_a.glucose <chr>, X45g.8_4_c.glucose <chr>,
#   X46g.8_5_b.glucose <chr>, X47g.8_6_c.glucose <chr>,
#   X48g.8_7_b.glucose <chr>, X49g.9_1_d.glucose <chr>,
#   X4g.1_4_d.glucose <chr>, X50g.9_2_d.glucose <chr>,
#   X51g.9_3_d.glucose <chr>, X52g.9_4_d.glucose <chr>,
#   X53g.9_5_d.glucose <chr>, X54g.9_6_d.glucose <chr>,
#   X55g.9_7_d.glucose <chr>, X56g.10_1_c.glucose <chr>,
#   X57g.10_2_d.glucose <chr>, X58g.10_3_c.glucose <chr>,
#   X59g.10_4_d.glucose <chr>, X5g.1_5_c.glucose <chr>,
#   X60g.11_1_a.glucose <chr>, X61g.11_2_d.glucose <chr>,
#   X62g.11_3_b.glucose <chr>, X63g.12_1_d.glucose <chr>,
#   X64g.12_2_b.glucose <chr>, X65g.13_1_a.glucose <chr>,
#   X66g.13_2_c.glucose <chr>, X67g.13_3_b.glucose <chr>,
#   X68g.13_4_a.glucose <chr>, X69g.13_5_c.glucose <chr>,
#   X70g.14_1_b.glucose <chr>, X71g.14_2_c.glucose <chr>,
#   X73g.14_4_a.glucose <chr>, X74g.14_5_b.glucose <chr>,
#   X75g.14_6_d.glucose <chr>, X76g.14_7_c.glucose <chr>,
#   X77g.15_2_d.glucose <chr>, X78g.15_3_b.glucose <chr>,
#   X79g.15_4_d.glucose <chr>, X7g.2_2_d.glucose <chr>,
#   X80g.15_5_b.glucose <chr>, X82g.16_1_d.glucose <chr>,
#   X83g.17_1_a.glucose <chr>, X84g.17_2_d.glucose <chr>,
#   X85g.17_4_a.glucose <chr>, X86g.17_5_b.glucose <chr>,
#   X87g.18_1_d.glucose <chr>, X88g.18_2_d.glucose <chr>,
#   X89g.18_3_d.glucose <chr>, X8g.2_3_d.glucose <chr>,
#   X90g.18_4_c.glucose <chr>, X92g.19_1_c.glucose <chr>,
#   X93g.19_2_c.glucose <chr>, X94g.19_3_c.glucose <chr>, …

13.1.3 Gene Position Matrix

> exp.pos %>% cbind(rownames(exp.pos), .) %>% as_tibble() %>% 
+   print()
# A tibble: 4,482 x 4
   V1      Chromsome Start_coord End_coord
   <chr>   <chr>     <chr>       <chr>    
 1 YJR107W 10        627333      628319   
 2 YPL270W 16        30482       32803    
 3 YDR518W 4         1478600     1480153  
 4 YDR233C 4         930353      929466   
 5 YHR098C 8         301937      299148   
 6 YFR029W 6         210925      212961   
 7 YPL198W 16        173151      174701   
 8 YDR001C 4         452472      450217   
 9 YLR394W 12        907950      909398   
10 YCR079W 3         252842      254170   
# … with 4,472 more rows

13.1.4 Row Names

The gene names are contained in the row names.

> head(rownames(exp.g))
[1] "YJR107W" "YPL270W" "YDR518W" "YDR233C" "YHR098C" "YFR029W"
> head(rownames(exp.e))
[1] "YJR107W" "YPL270W" "YDR518W" "YDR233C" "YHR098C" "YFR029W"
> head(rownames(exp.pos))
[1] "YJR107W" "YPL270W" "YDR518W" "YDR233C" "YHR098C" "YFR029W"
> all.equal(rownames(exp.g), rownames(exp.e))
[1] TRUE
> all.equal(rownames(exp.g), rownames(exp.pos))
[1] TRUE

13.1.5 Unify Column Names

The segregants are column names, and they are inconsistent across matrices.

> head(colnames(exp.g))
[1] "X100g.20_4_c.glucose" "X101g.21_1_d.glucose" "X102g.21_2_d.glucose"
[4] "X103g.21_3_d.glucose" "X104g.21_4_d.glucose" "X105g.21_5_c.glucose"
> head(colnames(marker))
[1] "20_4_c" "21_1_d" "21_2_d" "21_3_d" "21_4_d" "21_5_c"
>      
> ##fix column names with gsub
> colnames(exp.g) %<>% strsplit(split=".", fixed=TRUE) %>%
+   lapply(function(x) {x[2]})
> colnames(exp.e) %<>% strsplit(split=".", fixed=TRUE) %>%
+   lapply(function(x) {x[2]})
> head(colnames(exp.g))
[1] "20_4_c" "21_1_d" "21_2_d" "21_3_d" "21_4_d" "21_5_c"

13.1.6 Gene Positions

Let’s first pull out rownames of exp.pos and make them a column in the data frame.

> gene_pos <- exp.pos %>% as_tibble() %>%
+   mutate(gene = rownames(exp.pos)) %>%
+   dplyr::select(gene, chr = Chromsome, start = Start_coord, 
+                 end = End_coord)
> print(gene_pos, n=7)
# A tibble: 4,482 x 4
  gene      chr   start     end
  <chr>   <int>   <int>   <int>
1 YJR107W    10  627333  628319
2 YPL270W    16   30482   32803
3 YDR518W     4 1478600 1480153
4 YDR233C     4  930353  929466
5 YHR098C     8  301937  299148
6 YFR029W     6  210925  212961
7 YPL198W    16  173151  174701
# … with 4,475 more rows

13.1.7 Tidy Each Expression Matrix

We melt the expression matrices and bind them together into one big tidy data frame.

> exp_g <- melt(exp.g) %>% as_tibble() %>% 
+   dplyr::select(gene = Var1, segregant = Var2, 
+                 expression = value) %>%
+   mutate(condition = "glucose")
> exp_e <- melt(exp.e) %>% as_tibble() %>% 
+   dplyr::select(gene = Var1, segregant = Var2, 
+                 expression = value) %>%
+   mutate(condition = "ethanol")
> print(exp_e, n=4)
# A tibble: 488,538 x 4
  gene    segregant expression condition
  <fct>   <fct>          <dbl> <chr>    
1 YJR107W 20_4_c          0.06 ethanol  
2 YPL270W 20_4_c         -0.13 ethanol  
3 YDR518W 20_4_c         -0.94 ethanol  
4 YDR233C 20_4_c          0.04 ethanol  
# … with 4.885e+05 more rows

13.1.8 Combine Into Single Data Frame

Combine gene expression data from two conditions into a single data frame.

> exp_all <- bind_rows(exp_g, exp_e)
> sample_n(exp_all, size=10)
# A tibble: 10 x 4
   gene    segregant expression condition
   <fct>   <fct>          <dbl> <chr>    
 1 YBL087C 21_4_d         -0.72 ethanol  
 2 YDR524C 21_2_d         -0.17 glucose  
 3 YGR067C 9_1_d          -3.92 glucose  
 4 YHR207C 26_1_d         -0.43 ethanol  
 5 YDR329C 20_2_d         -0.06 glucose  
 6 YGL121C 8_7_b           1    ethanol  
 7 YJR044C 3_3_d          -0.12 ethanol  
 8 YIL088C 2_7_a           0.1  ethanol  
 9 YML127W 5_1_d          -0.08 ethanol  
10 YMR304W 6_1_d           0.2  ethanol

13.1.9 Join Gene Positions

Now we want to join the gene positions with the expression data.

> exp_all <- exp_all %>%
+   mutate(gene = as.character(gene), 
+          segregant = as.character(segregant))
> sk_tidy <- exp_all %>%
+   left_join(gene_pos, by = "gene")
> sample_n(sk_tidy, size=7)
# A tibble: 7 x 7
  gene    segregant expression condition   chr  start    end
  <chr>   <chr>          <dbl> <chr>     <int>  <int>  <int>
1 YGL189C 1_3_d          -0.26 ethanol       7 148594 148235
2 YBR257W 13_2_c          0.02 ethanol       2 728880 729719
3 YER098W 21_1_d          0.46 ethanol       5 355462 357726
4 YCR035C 9_1_d           0.07 glucose       3 193014 191830
5 YBR097W 17_5_b         -0.03 glucose       2 436945 441309
6 YBR235W 8_4_c          -0.18 ethanol       2 686896 690258
7 YJL094C 14_6_d          0    glucose      10 254437 251816

13.1.10 Apply `dplyr` Functions

Now that we have the data made tidy in the data frame sk_tidy, let’s apply some dplyr operations…

Does each gene have the same number of observations?

> sk_tidy %>% group_by(gene) %>% 
+   summarize(value = n()) %>%
+   summary()
     gene               value      
 Length:4478        Min.   :218.0  
 Class :character   1st Qu.:218.0  
 Mode  :character   Median :218.0  
                    Mean   :218.6  
                    3rd Qu.:218.0  
                    Max.   :872.0

No, so let’s see which genes have more than one set of observations.

> sk_tidy %>% group_by(gene) %>% 
+   summarize(value = n()) %>%
+   filter(value > median(value))
# A tibble: 4 x 2
  gene      value
  <chr>     <int>
1 YFR024C-A   872
2 YJL012C     872
3 YKL198C     872
4 YPR089W     872

Let’s remove replicated measurements for these genes.

> sk_tidy %<>% distinct(gene, segregant, condition, 
+                       .keep_all = TRUE)
> 
> sk_tidy %>% group_by(gene) %>% 
+   summarize(value = n()) %>%
+   summary()
     gene               value    
 Length:4478        Min.   :218  
 Class :character   1st Qu.:218  
 Mode  :character   Median :218  
                    Mean   :218  
                    3rd Qu.:218  
                    Max.   :218

As an exercise, think about how you would use dplyr to replace the replicated gene expression values with a single averaged expression value for these genes.

Get the mean and standard deviation expression per chromosome.

> sk_tidy %>%
+   group_by(chr) %>%
+   summarize(mean = mean(expression), sd=sd(expression))
# A tibble: 16 x 3
     chr    mean    sd
   <int>   <dbl> <dbl>
 1     1 -0.0762 0.826
 2     2 -0.0447 0.632
 3     3 -0.0230 0.682
 4     4 -0.0233 0.537
 5     5 -0.0579 0.610
 6     6 -0.0772 0.660
 7     7 -0.0441 0.617
 8     8 -0.0474 0.638
 9     9 -0.0430 0.614
10    10 -0.0299 0.570
11    11 -0.0396 0.613
12    12 -0.0515 0.643
13    13 -0.0265 0.584
14    14 -0.0294 0.642
15    15 -0.0130 0.554
16    16 -0.0368 0.604

Get the mean and standard deviation expression per chromosome in each condition.

> sk_tidy %>%
+   group_by(chr, condition) %>%
+   summarize(mean = mean(expression), sd=sd(expression))
# A tibble: 32 x 4
# Groups:   chr [?]
     chr condition      mean    sd
   <int> <chr>         <dbl> <dbl>
 1     1 ethanol    0.0260   0.480
 2     1 glucose   -0.178    1.05 
 3     2 ethanol    0.0132   0.479
 4     2 glucose   -0.103    0.750
 5     3 ethanol    0.000164 0.536
 6     3 glucose   -0.0461   0.800
 7     4 ethanol    0.00187  0.482
 8     4 glucose   -0.0484   0.586
 9     5 ethanol   -0.0297   0.479
10     5 glucose   -0.0862   0.716
# … with 22 more rows

Count the number of genes per chromosome.

> sk_tidy %>%
+   filter(condition == "glucose", segregant == "20_4_c") %>%
+   group_by(chr) %>% 
+   summarize(num.genes = n())
# A tibble: 16 x 2
     chr num.genes
   <int>     <int>
 1     1        60
 2     2       298
 3     3       125
 4     4       629
 5     5       207
 6     6        79
 7     7       395
 8     8       209
 9     9       152
10    10       256
11    11       241
12    12       387
13    13       367
14    14       319
15    15       388
16    16       366

Filter for the first gene on every chromosome.

> sk_tidy %>%
+   filter(condition == "glucose", segregant == "20_4_c") %>%
+   group_by(chr) %>%
+   filter(start == min(start))
# A tibble: 16 x 7
# Groups:   chr [16]
   gene    segregant expression condition   chr start   end
   <chr>   <chr>          <dbl> <chr>     <int> <int> <int>
 1 YHL040C 20_4_c        -2.79  glucose       8 20968 19085
 2 YNL334C 20_4_c        -0.9   glucose      14 12876 12208
 3 YOL157C 20_4_c        -1.06  glucose      15 24293 22524
 4 YKL222C 20_4_c         0.09  glucose      11  5621  3504
 5 YIL168W 20_4_c        -1.14  glucose       9 29032 29415
 6 YJL213W 20_4_c         0.84  glucose      10 32163 33158
 7 YPL272C 20_4_c        -0.18  glucose      16 28164 26611
 8 YLL063C 20_4_c        -0.66  glucose      12 16072 14648
 9 YFL048C 20_4_c        -0.09  glucose       6 40180 38843
10 YML132W 20_4_c        -0.21  glucose      13  7244  8383
11 YGL261C 20_4_c        -0.14  glucose       7  6652  6290
12 YBL107C 20_4_c         0.290 glucose       2 10551  9961
13 YDL248W 20_4_c        -0.68  glucose       4  1802  2953
14 YEL073C 20_4_c        -0.02  glucose       5  7553  7230
15 YAL062W 20_4_c        -5.64  glucose       1 31568 32941
16 YCL068C 20_4_c         0.47  glucose       3 12285 11503

To plot expression in glucose versus ethanol we first need to use dcast().

> sk_tidy %>% dcast(gene + segregant ~ condition, 
+                   value.var = "expression") %>%
+   as_tibble()
# A tibble: 488,102 x 4
   gene    segregant ethanol glucose
   <chr>   <chr>       <dbl>   <dbl>
 1 YAL002W 1_1_d        0.37   -0.01
 2 YAL002W 1_3_d        0.23    0.03
 3 YAL002W 1_4_d        0.08    0.07
 4 YAL002W 1_5_c       -0.12    0.13
 5 YAL002W 10_1_c       0.12   -0.1 
 6 YAL002W 10_2_d       0.1    -0.2 
 7 YAL002W 10_3_c       0.07   -0.15
 8 YAL002W 10_4_d       0.06   -0.04
 9 YAL002W 11_1_a       0.07   -0.07
10 YAL002W 11_2_d       0.3     0.1 
# … with 488,092 more rows

> sk_tidy %>% dcast(gene + segregant ~ condition, 
+                   value.var = "expression") %>%
+   filter(gene == "YAL002W") %>%
+   ggplot(aes(x = glucose, y = ethanol)) +
+   geom_point() +  theme_bw() +
+   theme(legend.position = "none")