Title: | Tidy Verbs for Dealing with Genomic Data Frames |
---|---|
Description: | Handle genomic data within data frames just as you would with 'GRanges'. This package provides methods to deal with genomic intervals the "tidy way", which makes it simpler to integrate them into the general data munging process. The API is inspired by the popular 'bedtools' and the genome_join() method from the 'fuzzyjoin' package. |
Authors: | Constantin Ahlmann-Eltze [aut, cre] , Stan Developers [cph] (Code from the Stan Math library is reused in 'cluster_interval.cpp'), David Robinson [cph] (Code from the fuzzyjoin package is reused) |
Maintainer: | Constantin Ahlmann-Eltze <[email protected]> |
License: | GPL-3 |
Version: | 0.1.2 |
Built: | 2024-11-08 05:14:27 UTC |
Source: | https://github.com/const-ae/tidygenomics |
Cluster ranges which are implemented as 2 equal-length numeric vectors.
cluster_interval(starts, ends, max_distance = 0L)
cluster_interval(starts, ends, max_distance = 0L)
starts |
A numeric vector that defines the starts of each interval |
ends |
A numeric vector that defines the ends of each interval |
max_distance |
The maximum distance up to which intervals are still considered to be the same cluster. Default: 0. |
starts <- c(50, 100, 120) ends <- c(75, 130, 150) j <- cluster_interval(starts, ends) j == c(0,1,1)
starts <- c(50, 100, 120) ends <- c(75, 130, 150) j <- cluster_interval(starts, ends) j == c(0,1,1)
Cluster the intervals in a data frame based on chromosome, start and end.
genome_cluster(x, by = NULL, max_distance = 0, cluster_column_name = "cluster_id")
genome_cluster(x, by = NULL, max_distance = 0, cluster_column_name = "cluster_id")
x |
A dataframe. |
by |
A character vector with 3 entries which are the chromosome, start and end column.
For example: `by = c("chromosome", "start", "end")` |
max_distance |
The maximum distance up to which intervals are still considered to be the same cluster. Default: 0. |
cluster_column_name |
A string that is used as the new column name |
The dataframe with the additional column of the cluster
library(dplyr) x1 <- data.frame(id = 1:4, bla=letters[1:4], chromosome = c("chr1", "chr1", "chr2", "chr1"), start = c(100, 120, 300, 260), end = c(150, 250, 350, 450)) genome_cluster(x1, by=c("chromosome", "start", "end")) genome_cluster(x1, by=c("chromosome", "start", "end"), max_distance=10)
library(dplyr) x1 <- data.frame(id = 1:4, bla=letters[1:4], chromosome = c("chr1", "chr1", "chr2", "chr1"), start = c(100, 120, 300, 260), end = c(150, 250, 350, 450)) genome_cluster(x1, by=c("chromosome", "start", "end")) genome_cluster(x1, by=c("chromosome", "start", "end"), max_distance=10)
Calculates the complement to the intervals covered by the intervals in
a data frame. It can optionally take a `chromosome_size` data frame
that contains 2 or 3 columns. The first column holds the chromosome names.
If there are 2 columns, the second holds the size of each chromosome; if there
are 3 columns, the second holds the start index and the third the end index
on the chromosome.
genome_complement(x, chromosome_size = NULL, by = NULL)
genome_complement(x, chromosome_size = NULL, by = NULL)
x |
A data frame for which the complement is calculated |
chromosome_size |
A dataframe with at least 2 columns that contains
first the chromosome name and then the size of that chromosome. Can be NULL,
in which case the largest value per chromosome from `x` is used. |
by |
A character vector with 3 entries which are the chromosome, start and end column.
For example: `by = c("chromosome", "start", "end")` |
library(dplyr) x1 <- data.frame(id = 1:4, bla=letters[1:4], chromosome = c("chr1", "chr1", "chr2", "chr1"), start = c(100, 200, 300, 400), end = c(150, 250, 350, 450)) genome_complement(x1, by=c("chromosome", "start", "end"))
library(dplyr) x1 <- data.frame(id = 1:4, bla=letters[1:4], chromosome = c("chr1", "chr1", "chr2", "chr1"), start = c(100, 200, 300, 400), end = c(150, 250, 350, 450)) genome_complement(x1, by=c("chromosome", "start", "end"))
Intersect data frames based on chromosome, start and end.
genome_intersect(x, y, by = NULL, mode = "both")
genome_intersect(x, y, by = NULL, mode = "both")
x |
A dataframe. |
y |
A dataframe. |
by |
A character vector with 3 entries which are used to match the chromosome, start and end column.
For example: `by = c("chromosome", "start", "end")` |
mode |
One of "both", "left", "right" or "anti". |
The intersected dataframe of `x` and `y` with the new boundaries.
library(dplyr) x1 <- data.frame(id = 1:4, bla=letters[1:4], chromosome = c("chr1", "chr1", "chr2", "chr2"), start = c(100, 200, 300, 400), end = c(150, 250, 350, 450)) x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], chromosome = c("chr1", "chr2", "chr2", "chr1"), start = c(140, 210, 400, 300), end = c(160, 240, 415, 320)) j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") print(j)
library(dplyr) x1 <- data.frame(id = 1:4, bla=letters[1:4], chromosome = c("chr1", "chr1", "chr2", "chr2"), start = c(100, 200, 300, 400), end = c(150, 250, 350, 450)) x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], chromosome = c("chr1", "chr2", "chr2", "chr1"), start = c(140, 210, 400, 300), end = c(160, 240, 415, 320)) j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") print(j)
Join intervals on chromosomes in data frames, to the closest partner
genome_join_closest(x, y, by = NULL, mode = "inner", distance_column_name = NULL, max_distance = Inf, select = "all") genome_inner_join_closest(x, y, by = NULL, ...) genome_left_join_closest(x, y, by = NULL, ...) genome_right_join_closest(x, y, by = NULL, ...) genome_full_join_closest(x, y, by = NULL, ...) genome_semi_join_closest(x, y, by = NULL, ...) genome_anti_join_closest(x, y, by = NULL, ...)
genome_join_closest(x, y, by = NULL, mode = "inner", distance_column_name = NULL, max_distance = Inf, select = "all") genome_inner_join_closest(x, y, by = NULL, ...) genome_left_join_closest(x, y, by = NULL, ...) genome_right_join_closest(x, y, by = NULL, ...) genome_full_join_closest(x, y, by = NULL, ...) genome_semi_join_closest(x, y, by = NULL, ...) genome_anti_join_closest(x, y, by = NULL, ...)
x |
A dataframe. |
y |
A dataframe. |
by |
A character vector with 3 entries which are used to match the chromosome, start and end column.
For example: `by = c("chromosome", "start", "end")` |
mode |
One of "inner", "full", "left", "right", "semi" or "anti". |
distance_column_name |
A string that is used as the new column name with the distance.
If `NULL` (the default), no distance column is added. |
max_distance |
The maximum distance that is allowed to join 2 entries. |
select |
A string that is passed on to `IRanges::distanceToNearest()`. Default: "all". |
... |
Additional arguments passed on to genome_join_closest. |
The joined dataframe of `x` and `y`.
library(dplyr) x1 <- data.frame(id = 1:4, bla=letters[1:4], chromosome = c("chr1", "chr1", "chr2", "chr2"), start = c(100, 200, 300, 400), end = c(150, 250, 350, 450)) x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], chromosome = c("chr1", "chr2", "chr2", "chr1"), start = c(140, 210, 400, 300), end = c(160, 240, 415, 320)) j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") print(j)
library(dplyr) x1 <- data.frame(id = 1:4, bla=letters[1:4], chromosome = c("chr1", "chr1", "chr2", "chr2"), start = c(100, 200, 300, 400), end = c(150, 250, 350, 450)) x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], chromosome = c("chr1", "chr2", "chr2", "chr1"), start = c(140, 210, 400, 300), end = c(160, 240, 415, 320)) j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") print(j)
Subtract one data frame from another based on chromosome, start and end.
genome_subtract(x, y, by = NULL)
genome_subtract(x, y, by = NULL)
x |
A dataframe. |
y |
A dataframe. |
by |
A character vector with 3 entries which are used to match the chromosome, start and end column.
For example: `by = c("chromosome", "start", "end")` |
The subtracted dataframe of `x` and `y` with the new boundaries.
library(dplyr) x1 <- data.frame(id = 1:4, bla=letters[1:4], chromosome = c("chr1", "chr1", "chr2", "chr1"), start = c(100, 200, 300, 400), end = c(150, 250, 350, 450)) x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], chromosome = c("chr1", "chr2", "chr1", "chr1"), start = c(120, 210, 300, 400), end = c(125, 240, 320, 415)) j <- genome_subtract(x1, x2, by=c("chromosome", "start", "end")) print(j)
library(dplyr) x1 <- data.frame(id = 1:4, bla=letters[1:4], chromosome = c("chr1", "chr1", "chr2", "chr1"), start = c(100, 200, 300, 400), end = c(150, 250, 350, 450)) x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], chromosome = c("chr1", "chr2", "chr1", "chr1"), start = c(120, 210, 300, 400), end = c(125, 240, 320, 415)) j <- genome_subtract(x1, x2, by=c("chromosome", "start", "end")) print(j)