76981

For each value of a column, divide that value by the number of rows that have the same sequence in another column

Question:

This is a part of my dataset:

structure(list(V1 = c("t00000406", "t00000517", "t00000519", "t00000589", "t00000589", "t00000598", "t00000804", "t00000938", "t00001008", "t00001156", "t00001156", "t00001156", "t00001165", "t00001165", "t00001165", "t00001265", "t00001265", "t00001265", "t00001511", "t00001562", "t00001562", "t00001599", "t00001703", "t00001703", "t00001703", "t00001710", "t00001710", "t00001710", "t00001710"), V2 = c(617L, 445L, 439L, 357L, 357L, 352L, 234L, 192L, 177L, 151L, 151L, 151L, 149L, 149L, 149L, 138L, 138L, 138L, 114L, 111L, 111L, 108L, 101L, 101L, 101L, 101L, 101L, 101L, 101L ), V4 = c("piR-hsa-3546", "piR-hsa-3454", "piR-hsa-3546", "piR-hsa-6909", "piR-hsa-6908", "piR-hsa-3454", "piR-hsa-3454", "piR-hsa-3454", "piR-hsa-3454", "piR-hsa-31261", "piR-hsa-14100", "piR-hsa-14099", "piR-hsa-28592", "piR-hsa-6592", "piR-hsa-6591", "piR-hsa-14099", "piR-hsa-31261", "piR-hsa-14100", "piR-hsa-6909", "piR-hsa-16270", "piR-hsa-16271", "piR-hsa-620", "piR-hsa-31261", "piR-hsa-14100", "piR-hsa-14099", "piR-hsa-14098", "piR-hsa-14100", "piR-hsa-14099", "piR-hsa-31261"), V6 = c("CTGTTAACCGAAAGGTTGGTGGT", "CACGTGTTAGGACCCGAAAGA", "CGGCTGTTAACCGAAAGGTTGGTGGT", "GTTTCCGTAGTGTAGTGGTCATC", "GTTTCCGTAGTGTAGTGGTCATC", "ACGTGTTAGGACCCGAAAGA", "CGTGTTAGGACCCGAAAGA", "TGTTAGGACCCGAAAGA", "CGCACGTGTTAGGACCCGAAAGA", "TCCCTGGTGGTCTAGTGGTTAGGATTCGGC", "TCCCTGGTGGTCTAGTGGTTAGGATTCGGC", "TCCCTGGTGGTCTAGTGGTTAGGATTCGGC", "GTAGTCGTGGCCGAGTGGTTAAG", "GTAGTCGTGGCCGAGTGGTTAAG", "GTAGTCGTGGCCGAGTGGTTAAG", "TCCCTGGTGGTCTAGTGGTTAGGATT", "TCCCTGGTGGTCTAGTGGTTAGGATT", "TCCCTGGTGGTCTAGTGGTTAGGATT", "GTTTCCGTAGTGTAGTGGTCATCACGTTCGCC", "CTGAGGGTCCAGGGT", "CTGAGGGTCCAGGGT", "CGTAGTTCCGACCATAAACGATGCC", "TCCCTGGTGGTCTAGTGGTTAGGATTC", "TCCCTGGTGGTCTAGTGGTTAGGATTC", "TCCCTGGTGGTCTAGTGGTTAGGATTC", "TCCCTGGTGGTCTAGTGGTTAGGAT", "TCCCTGGTGGTCTAGTGGTTAGGAT", "TCCCTGGTGGTCTAGTGGTTAGGAT", "TCCCTGGTGGTCTAGTGGTTAGGAT")), row.names = c(NA, -29L), class = c("tbl_df", "tbl", "data.frame"))

I want to "mutate" column V2 so that each value is divided by the number of times the same "sequence" (column V6) occurs in the data. For example:

The sequence GTAGTCGTGGCCGAGTGGTTAAG exists 3 times:

# Count the rows whose sequence (V6) equals one specific value.
nrow(filter(my_data, V6 == "GTAGTCGTGGCCGAGTGGTTAAG"))
# [1] 3

Result:

# Manual version for one hard-coded sequence: keep only its rows, then divide
# V2 by the number of rows that carry that same sequence.
filter(pir_onehun, V6 == "GTAGTCGTGGCCGAGTGGTTAAG") %>%
  mutate(
    V2 = V2 / nrow(filter(pir_onehun, V6 == "GTAGTCGTGGCCGAGTGGTTAAG"))
  )
# # A tibble: 3 x 4
#   V1           V2 V4            V6
#   <chr>     <dbl> <chr>         <chr>
# 1 t00001165  49.7 piR-hsa-28592 GTAGTCGTGGCCGAGTGGTTAAG
# 2 t00001165  49.7 piR-hsa-6592  GTAGTCGTGGCCGAGTGGTTAAG
# 3 t00001165  49.7 piR-hsa-6591  GTAGTCGTGGCCGAGTGGTTAAG

I thought of appending the results to a new data frame with bind_rows, but there must be a more "tidy" way of doing it.

Thank you for your time

Answer1:

# Divide every V2 value by the number of rows that share its sequence (V6).
df %>%
  group_by(V6) %>%
  mutate(V2 = V2 / n()) %>% # n() is the row count of the current V6 group
  ungroup() # drop the grouping so later verbs do not silently run per group

does the job. First we group by sequence and then divide by the size of this group, n().

Recommend