10 Case Study in Data Wrangling

10.1 Yeast Genomics

Smith and Kruglyak (2008) is a study that measured 2820 genotypes in 109 yeast F1 segregants from a cross between parental lines BY and RM.

They also measured gene expression on 4482 genes in each of these segregants when growing in two different Carbon sources, glucose and ethanol.

10.1.1 Load Data

The data was distributed as a collection of matrices in R.

> rm(list=ls())
> load("./data/smith_kruglyak.RData")
> ls()
[1] "exp.e"      "exp.g"      "exp.pos"    "marker"     "marker.pos"
> eapply(env=.GlobalEnv, dim)
$exp.e
[1] 4482  109

$exp.g
[1] 4482  109

$marker
[1] 2820  109

$exp.pos
[1] 4482    3

$marker.pos
[1] 2820    2

10.1.2 Gene Expression Matrices

> exp.g %>% cbind(rownames(exp.g), .) %>% as_tibble() %>% 
+   print()
Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
This warning is displayed once per session.
# A tibble: 4,482 x 110
   V1    X100g.20_4_c.gl… X101g.21_1_d.gl… X102g.21_2_d.gl…
   <chr> <chr>            <chr>            <chr>           
 1 YJR1… 0.22             0.18             0.05            
 2 YPL2… -0.29            -0.2             -0.19           
 3 YDR5… 0.72             0.04             0.26            
 4 YDR2… 0.23             0.31             0.12            
 5 YHR0… 0.4              -0.04            0.36            
 6 YFR0… -0.36            0.35             -0.26           
 7 YPL1… 0.23             -0.21            -0.25           
 8 YDR0… -0.09            0.57             0.24            
 9 YLR3… -0.23            0.13             -0.17           
10 YCR0… -0.25            -0.98            -0.3            
# … with 4,472 more rows, and 106 more variables:
#   X103g.21_3_d.glucose <chr>, X104g.21_4_d.glucose <chr>,
#   X105g.21_5_c.glucose <chr>, X106g.22_2_d.glucose <chr>,
#   X107g.22_3_b.glucose <chr>, X109g.22_5_d.glucose <chr>,
#   X10g.2_5_d.glucose <chr>, X110g.23_3_d.glucose <chr>,
#   X111g.23_5_d.glucose <chr>, X112g.24_1_d.glucose <chr>,
#   X113g.25_1_d.glucose <chr>, X114g.25_3_d.glucose <chr>,
#   X115g.25_4_d.glucose <chr>, X116g.26_1_d.glucose <chr>,
#   X117g.26_2_d.glucose <chr>, X11g.2_6_d.glucose <chr>,
#   X12g.2_7_a.glucose <chr>, X13g.3_1_d.glucose <chr>,
#   X15g.3_3_d.glucose <chr>, X16g.3_4_d.glucose <chr>,
#   X17g.3_5_d.glucose <chr>, X18g.4_1_c.glucose <chr>,
#   X1g.1_1_d.glucose <chr>, X20g.4_3_d.glucose <chr>,
#   X21g.4_4_d.glucose <chr>, X22g.5_1_d.glucose <chr>,
#   X23g.5_2_d.glucose <chr>, X24g.5_3_d.glucose <chr>,
#   X25g.5_4_d.glucose <chr>, X26g.5_5_d.glucose <chr>,
#   X27g.6_1_d.glucose <chr>, X28g.6_2_b.glucose <chr>,
#   X29g.6_3_c.glucose <chr>, X30g.6_4_d.glucose <chr>,
#   X31g.6_5_d.glucose <chr>, X32g.6_6_d.glucose <chr>,
#   X33g.6_7_d.glucose <chr>, X34g.7_1_d.glucose <chr>,
#   X35g.7_2_c.glucose <chr>, X36g.7_3_d.glucose <chr>,
#   X37g.7_4_c.glucose <chr>, X38g.7_5_d.glucose <chr>,
#   X39g.7_6_c.glucose <chr>, X3g.1_3_d.glucose <chr>,
#   X40g.7_7_c.glucose <chr>, X41g.7_8_d.glucose <chr>,
#   X42g.8_1_a.glucose <chr>, X43g.8_2_d.glucose <chr>,
#   X44g.8_3_a.glucose <chr>, X45g.8_4_c.glucose <chr>,
#   X46g.8_5_b.glucose <chr>, X47g.8_6_c.glucose <chr>,
#   X48g.8_7_b.glucose <chr>, X49g.9_1_d.glucose <chr>,
#   X4g.1_4_d.glucose <chr>, X50g.9_2_d.glucose <chr>,
#   X51g.9_3_d.glucose <chr>, X52g.9_4_d.glucose <chr>,
#   X53g.9_5_d.glucose <chr>, X54g.9_6_d.glucose <chr>,
#   X55g.9_7_d.glucose <chr>, X56g.10_1_c.glucose <chr>,
#   X57g.10_2_d.glucose <chr>, X58g.10_3_c.glucose <chr>,
#   X59g.10_4_d.glucose <chr>, X5g.1_5_c.glucose <chr>,
#   X60g.11_1_a.glucose <chr>, X61g.11_2_d.glucose <chr>,
#   X62g.11_3_b.glucose <chr>, X63g.12_1_d.glucose <chr>,
#   X64g.12_2_b.glucose <chr>, X65g.13_1_a.glucose <chr>,
#   X66g.13_2_c.glucose <chr>, X67g.13_3_b.glucose <chr>,
#   X68g.13_4_a.glucose <chr>, X69g.13_5_c.glucose <chr>,
#   X70g.14_1_b.glucose <chr>, X71g.14_2_c.glucose <chr>,
#   X73g.14_4_a.glucose <chr>, X74g.14_5_b.glucose <chr>,
#   X75g.14_6_d.glucose <chr>, X76g.14_7_c.glucose <chr>,
#   X77g.15_2_d.glucose <chr>, X78g.15_3_b.glucose <chr>,
#   X79g.15_4_d.glucose <chr>, X7g.2_2_d.glucose <chr>,
#   X80g.15_5_b.glucose <chr>, X82g.16_1_d.glucose <chr>,
#   X83g.17_1_a.glucose <chr>, X84g.17_2_d.glucose <chr>,
#   X85g.17_4_a.glucose <chr>, X86g.17_5_b.glucose <chr>,
#   X87g.18_1_d.glucose <chr>, X88g.18_2_d.glucose <chr>,
#   X89g.18_3_d.glucose <chr>, X8g.2_3_d.glucose <chr>,
#   X90g.18_4_c.glucose <chr>, X92g.19_1_c.glucose <chr>,
#   X93g.19_2_c.glucose <chr>, X94g.19_3_c.glucose <chr>, …

10.1.3 Gene Position Matrix

> exp.pos %>% cbind(rownames(exp.pos), .) %>% as_tibble() %>% 
+   print()
# A tibble: 4,482 x 4
   V1      Chromsome Start_coord End_coord
   <chr>   <chr>     <chr>       <chr>    
 1 YJR107W 10        627333      628319   
 2 YPL270W 16        30482       32803    
 3 YDR518W 4         1478600     1480153  
 4 YDR233C 4         930353      929466   
 5 YHR098C 8         301937      299148   
 6 YFR029W 6         210925      212961   
 7 YPL198W 16        173151      174701   
 8 YDR001C 4         452472      450217   
 9 YLR394W 12        907950      909398   
10 YCR079W 3         252842      254170   
# … with 4,472 more rows

10.1.4 Row Names

The gene names are contained in the row names.

> head(rownames(exp.g))
[1] "YJR107W" "YPL270W" "YDR518W" "YDR233C" "YHR098C" "YFR029W"
> head(rownames(exp.e))
[1] "YJR107W" "YPL270W" "YDR518W" "YDR233C" "YHR098C" "YFR029W"
> head(rownames(exp.pos))
[1] "YJR107W" "YPL270W" "YDR518W" "YDR233C" "YHR098C" "YFR029W"
> all.equal(rownames(exp.g), rownames(exp.e))
[1] TRUE
> all.equal(rownames(exp.g), rownames(exp.pos))
[1] TRUE

10.1.5 Unify Column Names

The segregants are column names, and they are inconsistent across matrices.

> head(colnames(exp.g))
[1] "X100g.20_4_c.glucose" "X101g.21_1_d.glucose" "X102g.21_2_d.glucose"
[4] "X103g.21_3_d.glucose" "X104g.21_4_d.glucose" "X105g.21_5_c.glucose"
> head(colnames(marker))
[1] "20_4_c" "21_1_d" "21_2_d" "21_3_d" "21_4_d" "21_5_c"
>      
> ##fix column names with gsub
> colnames(exp.g) %<>% strsplit(split=".", fixed=TRUE) %>%
+   lapply(function(x) {x[2]})
> colnames(exp.e) %<>% strsplit(split=".", fixed=TRUE) %>%
+   lapply(function(x) {x[2]})
> head(colnames(exp.g))
[1] "20_4_c" "21_1_d" "21_2_d" "21_3_d" "21_4_d" "21_5_c"

10.1.6 Gene Positions

Let’s first pull out rownames of exp.pos and make them a column in the data frame.

> gene_pos <- exp.pos %>% as_tibble() %>%
+   mutate(gene = rownames(exp.pos)) %>%
+   dplyr::select(gene, chr = Chromsome, start = Start_coord, 
+                 end = End_coord)
> print(gene_pos, n=7)
# A tibble: 4,482 x 4
  gene      chr   start     end
  <chr>   <int>   <int>   <int>
1 YJR107W    10  627333  628319
2 YPL270W    16   30482   32803
3 YDR518W     4 1478600 1480153
4 YDR233C     4  930353  929466
5 YHR098C     8  301937  299148
6 YFR029W     6  210925  212961
7 YPL198W    16  173151  174701
# … with 4,475 more rows

10.1.7 Tidy Each Expression Matrix

We melt the expression matrices and bind them together into one big tidy data frame.

> exp_g <- melt(exp.g) %>% as_tibble() %>% 
+   dplyr::select(gene = Var1, segregant = Var2, 
+                 expression = value) %>%
+   mutate(condition = "glucose")
> exp_e <- melt(exp.e) %>% as_tibble() %>% 
+   dplyr::select(gene = Var1, segregant = Var2, 
+                 expression = value) %>%
+   mutate(condition = "ethanol")
> print(exp_e, n=4)
# A tibble: 488,538 x 4
  gene    segregant expression condition
  <fct>   <fct>          <dbl> <chr>    
1 YJR107W 20_4_c          0.06 ethanol  
2 YPL270W 20_4_c         -0.13 ethanol  
3 YDR518W 20_4_c         -0.94 ethanol  
4 YDR233C 20_4_c          0.04 ethanol  
# … with 4.885e+05 more rows

10.1.8 Combine Into Single Data Frame

Combine gene expression data from two conditions into a single data frame.

> exp_all <- bind_rows(exp_g, exp_e)
> sample_n(exp_all, size=10)
# A tibble: 10 x 4
   gene    segregant expression condition
   <fct>   <fct>          <dbl> <chr>    
 1 YAL039C 5_4_d          0.290 glucose  
 2 YFL034W 13_1_a        -0.08  glucose  
 3 YHR138C 17_5_b        -0.49  glucose  
 4 YOR122C 8_3_a          0.15  glucose  
 5 YOR330C 5_1_d         -0.404 ethanol  
 6 YNL246W 9_4_d          0.36  glucose  
 7 YPL146C 13_5_c        -0.06  glucose  
 8 YLR401C 9_5_d         -0.61  ethanol  
 9 YNL177C 22_3_b        -0.1   glucose  
10 YGR125W 11_2_d        -0.05  ethanol  

10.1.9 Join Gene Positions

Now we want to join the gene positions with the expression data.

> exp_all <- exp_all %>%
+   mutate(gene = as.character(gene), 
+          segregant = as.character(segregant))
> sk_tidy <- exp_all %>%
+   left_join(gene_pos, by = "gene")
> sample_n(sk_tidy, size=7)
# A tibble: 7 x 7
  gene    segregant expression condition   chr  start    end
  <chr>   <chr>          <dbl> <chr>     <int>  <int>  <int>
1 YGL098W 14_4_a          0.26 ethanol       7 317345 318082
2 YGR058W 14_5_b         -0.23 glucose       7 606140 607147
3 YOR217W 5_5_d          -0.37 ethanol      15 749302 751887
4 YIL057C 9_1_d          -3.99 glucose       9 248393 247899
5 YMR316W 2_6_d           0.44 glucose      13 904823 905833
6 YCL035C 14_4_a          0.91 glucose       3  61173  60841
7 YLR020C 2_3_d           0.04 ethanol      12 183404 181788

10.1.10 Apply dplyr Functions

Now that we have the data made tidy in the data frame sk_tidy, let’s apply some dplyr operations…

Does each gene have the same number of observations?

> sk_tidy %>% group_by(gene) %>% 
+   summarize(value = n()) %>%
+   summary()
     gene               value      
 Length:4478        Min.   :218.0  
 Class :character   1st Qu.:218.0  
 Mode  :character   Median :218.0  
                    Mean   :218.6  
                    3rd Qu.:218.0  
                    Max.   :872.0  

No, so let’s see which genes have more than one set of observations.

> sk_tidy %>% group_by(gene) %>% 
+   summarize(value = n()) %>%
+   filter(value > median(value))
# A tibble: 4 x 2
  gene      value
  <chr>     <int>
1 YFR024C-A   872
2 YJL012C     872
3 YKL198C     872
4 YPR089W     872

Let’s remove replicated measurements for these genes.

> sk_tidy %<>% distinct(gene, segregant, condition, 
+                       .keep_all = TRUE)
> 
> sk_tidy %>% group_by(gene) %>% 
+   summarize(value = n()) %>%
+   summary()
     gene               value    
 Length:4478        Min.   :218  
 Class :character   1st Qu.:218  
 Mode  :character   Median :218  
                    Mean   :218  
                    3rd Qu.:218  
                    Max.   :218  

As an exercise, think about how you would use dplyr to replace the replicated gene expression values with a single averaged expression value for these genes.

Get the mean and standard deviation expression per chromosome.

> sk_tidy %>%
+   group_by(chr) %>%
+   summarize(mean = mean(expression), sd=sd(expression))
# A tibble: 16 x 3
     chr    mean    sd
   <int>   <dbl> <dbl>
 1     1 -0.0762 0.826
 2     2 -0.0447 0.632
 3     3 -0.0230 0.682
 4     4 -0.0233 0.537
 5     5 -0.0579 0.610
 6     6 -0.0772 0.660
 7     7 -0.0441 0.617
 8     8 -0.0474 0.638
 9     9 -0.0430 0.614
10    10 -0.0299 0.570
11    11 -0.0396 0.613
12    12 -0.0515 0.643
13    13 -0.0265 0.584
14    14 -0.0294 0.642
15    15 -0.0130 0.554
16    16 -0.0368 0.604

Get the mean and standard deviation expression per chromosome in each condition.

> sk_tidy %>%
+   group_by(chr, condition) %>%
+   summarize(mean = mean(expression), sd=sd(expression))
# A tibble: 32 x 4
# Groups:   chr [16]
     chr condition      mean    sd
   <int> <chr>         <dbl> <dbl>
 1     1 ethanol    0.0260   0.480
 2     1 glucose   -0.178    1.05 
 3     2 ethanol    0.0132   0.479
 4     2 glucose   -0.103    0.750
 5     3 ethanol    0.000164 0.536
 6     3 glucose   -0.0461   0.800
 7     4 ethanol    0.00187  0.482
 8     4 glucose   -0.0484   0.586
 9     5 ethanol   -0.0297   0.479
10     5 glucose   -0.0862   0.716
# … with 22 more rows

Count the number of genes per chromosome.

> sk_tidy %>%
+   filter(condition == "glucose", segregant == "20_4_c") %>%
+   group_by(chr) %>% 
+   summarize(num.genes = n())
# A tibble: 16 x 2
     chr num.genes
   <int>     <int>
 1     1        60
 2     2       298
 3     3       125
 4     4       629
 5     5       207
 6     6        79
 7     7       395
 8     8       209
 9     9       152
10    10       256
11    11       241
12    12       387
13    13       367
14    14       319
15    15       388
16    16       366

Filter for the first gene on every chromosome.

> sk_tidy %>%
+   filter(condition == "glucose", segregant == "20_4_c") %>%
+   group_by(chr) %>%
+   filter(start == min(start))
# A tibble: 16 x 7
# Groups:   chr [16]
   gene    segregant expression condition   chr start   end
   <chr>   <chr>          <dbl> <chr>     <int> <int> <int>
 1 YHL040C 20_4_c        -2.79  glucose       8 20968 19085
 2 YNL334C 20_4_c        -0.9   glucose      14 12876 12208
 3 YOL157C 20_4_c        -1.06  glucose      15 24293 22524
 4 YKL222C 20_4_c         0.09  glucose      11  5621  3504
 5 YIL168W 20_4_c        -1.14  glucose       9 29032 29415
 6 YJL213W 20_4_c         0.84  glucose      10 32163 33158
 7 YPL272C 20_4_c        -0.18  glucose      16 28164 26611
 8 YLL063C 20_4_c        -0.66  glucose      12 16072 14648
 9 YFL048C 20_4_c        -0.09  glucose       6 40180 38843
10 YML132W 20_4_c        -0.21  glucose      13  7244  8383
11 YGL261C 20_4_c        -0.14  glucose       7  6652  6290
12 YBL107C 20_4_c         0.290 glucose       2 10551  9961
13 YDL248W 20_4_c        -0.68  glucose       4  1802  2953
14 YEL073C 20_4_c        -0.02  glucose       5  7553  7230
15 YAL062W 20_4_c        -5.64  glucose       1 31568 32941
16 YCL068C 20_4_c         0.47  glucose       3 12285 11503

To plot expression in glucose versus ethanol we first need to use dcast().

> sk_tidy %>% dcast(gene + segregant ~ condition, 
+                   value.var = "expression") %>%
+   as_tibble()
# A tibble: 488,102 x 4
   gene    segregant ethanol glucose
   <chr>   <chr>       <dbl>   <dbl>
 1 YAL002W 1_1_d        0.37   -0.01
 2 YAL002W 1_3_d        0.23    0.03
 3 YAL002W 1_4_d        0.08    0.07
 4 YAL002W 1_5_c       -0.12    0.13
 5 YAL002W 10_1_c       0.12   -0.1 
 6 YAL002W 10_2_d       0.1    -0.2 
 7 YAL002W 10_3_c       0.07   -0.15
 8 YAL002W 10_4_d       0.06   -0.04
 9 YAL002W 11_1_a       0.07   -0.07
10 YAL002W 11_2_d       0.3     0.1 
# … with 488,092 more rows
> sk_tidy %>% dcast(gene + segregant ~ condition, 
+                   value.var = "expression") %>%
+   filter(gene == "YAL002W") %>%
+   ggplot(aes(x = glucose, y = ethanol)) +
+   geom_point() +  theme_bw() +
+   theme(legend.position = "none")