I have a large data set and I want to write a custom merge function to use with apply but I can't solve a certain issue. I can't use a loop as it will take too long. The data roughly looks like this;
# [ Name, Strand, Start, End ]
R1 = c( 'GeneA', '+', 1000, 1500 )
R2 = c( 'GeneA', '+', 1510, 2000 )
R3 = c( 'GeneA', '+', 2001, 2500 )
R4 = c( 'GeneB', '-', 3100, 4000 )
The data is a data.frame with rows R1:R4
So far I can get a function which compares Ri and Rj (j = i +1) and merges them if Name is the same, Strand is the same, and the gap between them is less then 100.
GAP = Ri[End] - Rj[Start]
If I apply the function to each row and build up the output. The function output then should create
R1 = c( 'GeneA', '+', 1000, 2500 )
R4 = c( 'GeneB', '-', 3100, 4000 )
I can get a working function which can use apply to merge two consecutive elements into one but I can't figure out how to merge three consecutive elements. One ugly solution was to run the two consecutive element merger function until no additional mergers are made but this is not efficient. I'm a bit stuck for ideas and any insights would be appreciated.
EDIT: to clarify, the data is ordered by Start position on consecutive chromosomes (not shown) and each gene name occurs multiple times in the data set at different positions (ie. GeneA can be 1000-2500 and then again at 4000-5000, and I don't want to merge those two genes, only consecutive ones).
EDIT 2: I have used Tim P solution below. A problem has arisen in the efficiency of Merging. Also is there a way to post text files to stack overflow so I can show what the data really looks like and then post my script so far?
# Let ValidMerge be a logical vector of MergeDown corresponding to the rows in the data
# as defined by TimP below
# The data.frame is called RMDB
# TeMerge function is previously defined to merge two rows Ri and Rj into one entry
# which spans the start of Ri to the end of Rj (with same name and strand)
COUNT = 0
RMDB.OUT = 0
while (COUNT < nrow(RMDB)) { # Cycle through all rows of RMDB
COUNT = COUNT + 1
# Is this position a merger start?
# If yes, then returns position in startpt
# which will be the end position in endpt
# If no, returns 0
Merge = match(COUNT,startpt,nomatch=0)
if ( Merge == 0 ){
# No merge starts at this position
RMDB.OUT = rbind(RMDB.OUT,RMDB[COUNT,]) # Append COUNT Row to output
}else{
# Merge COUNT row in RMDB to its endpoint
RMDB.OUT = rbind(RMDB.OUT,
TeMerge(RMDB[COUNT,],RMDB[endpt[Merge],]))
#print(paste('Merging Row',COUNT,' and Row',endpt[Merge]))
COUNT = endpt[Merge] # Move Count to the endpoint
}
}
This script gives exactly the results I am interested in, the only problem is that my data.frame has 5 000 000 entries and I would like to run the analysis using several parameters for GAP size to compare the results. Is there a way which I can rewrite this part of the code to be more efficient? Everything up until this point runs in reasonable amount of time (~couple of minutes). This part of has been going for 3+ hours on a 700 000 subset of the data.
EDIT 3: The spiked data which has all cases at the top (MIR3). Ignore Columns 1:4,8,11:15.
dput(RMDB) structure(list(V1 = c(3612L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 741L, 444L, 741L, 407L, 2059L, 407L, 656L, 2058L, 257L, 4051L, 456L, 351L, 850L, 335L, 1000L, 1566L, 236L, 588L, 3877L, 750L, 2292L, 783L, 747L, 666L, 260L, 1118L, 341L, 7010L, 320L, 7010L, 249L, 458L, 24L, 631L, 631L, 875L), V2 = c(11.4, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 18.7, 11.6, 24.9, 28.8, 14.1, 28.8, 23.6, 18.3, 25, 7, 8.2, 23.6, 24.9, 29.5, 13.5, 19.4, 34.8, 17.4, 22.9, 27.6, 12, 26.6, 30.4, 12.9, 38.5, 35.4, 27.8, 19.2, 0, 17.3, 21.2, 19.3, 0, 3.9, 26.6, 22.6), V3 = c(27, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 4.5, 0, 0, 7.3, 0.3, 7.3, 12.2, 4.9, 0, 0.4, 0, 1.8, 15.1, 12.7, 0, 3.6, 3, 7.5, 14, 9.4, 0, 14.1, 4.1, 4, 1.4, 9.4, 5.9, 2.7, 0, 9.3, 1, 0, 0, 0, 1.8, 9.5), V4 = c(1.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.1, 1.1, 3, 0.3, 3, 3, 0, 0, 0, 0, 3.6, 0.8, 2.7, 0, 2.8, 0.4, 0.8, 1.6, 6.4, 0, 3.8, 3.7, 0, 0, 2.6, 1.5, 2.5, 2.4, 1.4, 2, 5, 0, 0, 0.6, 0.5), V5 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"), V6 = c(10469L, 20001L, 20210L, 21000L, 21201L, 22000L, 22201L, 23000L, 23201L, 20000L, 20001L, 24001L, 24205L, 24405L, 0L, 30855L, 30953L, 31293L, 31436L, 31734L, 32841L, 33048L, 33466L, 33529L, 34048L, 34451L, 34565L, 35217L, 35367L, 37045L, 37733L, 38059L, 38256L, 39465L, 39624L, 39925L, 40333L, 40629L, 40736L, 41380L, 42370L, 43243L, 44836L, 44877L, 45887L, 46079L, 46217L, 46416L, 46553L, 46893L), V7 = c(11447L, 20200L, 20400L, 21200L, 21400L, 22200L, 22400L, 23200L, 23400L, 20001L, 20200L, 24200L, 24400L, 24600L, 0L, 30952L, 31131L, 31435L, 31733L, 31754L, 33037L, 33456L, 33509L, 34041L, 34108L, 34560L, 34921L, 35366L, 35499L, 37431L, 37861L, 38191L, 39464L, 39623L, 39924L, 40294L, 40626L, 40729L, 40878L, 42285L, 42504L, 44835L, 44876L, 45753L, 45987L, 46198L, 46240L, 46493L, 46722L, 47092L), V8 = structure(c(38L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 36L, 35L, 34L, 33L, 32L, 31L, 30L, 29L, 28L, 27L, 26L, 25L, 24L, 23L, 22L, 21L, 20L, 19L, 18L, 17L, 16L, 15L, 14L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("(249203529)", "(249203899)", "(249204128)", "(249204381)", "(249204423)", "(249204634)", "(249204868)", "(249205745)", "(249205786)", "(249208117)", "(249208336)", "(249209743)", "(249209892)", "(249209995)", "(249210327)", "(249210697)", "(249210998)", "(249211157)", "(249212430)", "(249212760)", "(249213190)", "(249215122)", "(249215255)", "(249215700)", "(249216061)", "(249216513)", "(249216580)", "(249217112)", "(249217165)", "(249217584)", "(249218867)", "(249218888)", "(249219186)", "(249219490)", "(249219669)", "(249219773)", "(249235266)", "(249239174)"), class = "factor"), V9 = structure(c(2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("+", "C"), class = "factor"), V10 = structure(c(32L, 23L, 23L, 23L, 23L, 23L, 24L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L, 22L, 12L, 5L, 14L, 13L, 16L, 30L, 25L, 2L, 7L, 16L, 15L, 29L, 28L, 3L, 28L, 15L, 4L, 18L, 8L, 19L, 10L, 31L, 10L, 9L, 11L, 6L, 17L, 20L, 21L ), .Label = c("AluJo", "AluJr", "AluSx", "AluSz6", "AluYc", "AT_rich", "Charlie5", "ERVL-E-int", "L1M5", "L1MA8", "L1MA9", "L1MB5", "L1P1", "L1PA6", "L2a", "L2c", "LTR12F", "LTR16C", "MamRep1527", "MER45A", "MER58A", "MIR", "MIR3", "MIR3a", "MIRb", "MIRc", "MLT1A", "MLT1E1A", "MLT1E1A-int", "MLT1J2", "(TAAA)n", "TAR1", "(TC)n", "XXXXX"), class = "factor"), V11 = structure(c(10L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 11L, 9L, 13L, 12L, 13L, 13L, 3L, 12L, 3L, 3L, 4L, 9L, 13L, 12L, 1L, 4L, 4L, 9L, 9L, 12L, 9L, 4L, 12L, 8L, 8L, 6L, 3L, 11L, 3L, 3L, 3L, 5L, 7L, 2L, 1L), .Label = c("DNA/hAT-Charlie", "DNA/hAT-Tip100", "LINE/L1", "LINE/L2", "Low_complexity", "LTR", "LTR/ERV1", "LTR/ERVL", "LTR/ERVL-MaLR", "Satellite/telo", "Simple_repeat", "SINE/Alu", "SINE/MIR", "XXXXXXXX"), class = "factor"), V12 = structure(c(19L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 2L, 9L, 8L, 25L, 2L, 10L, 9L, 23L, 2L, 1L, 15L, 12L, 1L, 6L, 2L, 11L, 16L, 13L, 7L, 2L, 2L, 8L, 14L, 1L, 3L, 26L, 17L, 18L, 9L, 21L, 22L, 24L, 2L, 4L, 27L, 20L), .Label = c("(0)", "1", "(113)", "(115)", "(119)", "(13)", "131", "173", "2", "218", "2234", "(231)", "2705", "2923", "2970", "3242", "359", "3715", "(399)", "(4)", "5306", "5334", "5746", "6167", "67", "(685)", "7"), class = "factor"), V13 = c(1712L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 162L, 96L, 349L, 217L, 298L, 238L, 216L, 6174L, 44L, 6154L, 3030L, 3156L, 448L, 255L, 133L, 2623L, 3375L, 2846L, 1489L, 172L, 301L, 666L, 3217L, 312L, 376L, 4982L, 499L, 5305L, 41L, 6290L, 5433L, 6280L, 24L, 404L, 178L, 220L), V14 = structure(c(23L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 8L, 1L, 10L, 25L, 4L, 13L, 21L, 1L, 11L, 26L, 15L, 14L, 20L, 30L, 5L, 2L, 1L, 27L, 1L, 18L, 3L, 4L, 7L, 6L, 9L, 19L, 22L, 29L, 1L, 2L, 28L, 16L, 1L, 17L, 1L, 12L), .Label = c("(0)", "(1)", "(11)", "(14)", "(179)", "208", "(209)", "(212)", "232", "(25)", "(255)", "3", "(30)", "3049", "(3116)", "(32)", "327", "(388)", "4016", "41", "(46)", "(470)", "483", "49", "(51)", "5640", "(573)", "(691)", "(838)", "91"), class = "factor"), V15 = c(2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 22L, 23L, 22L, 24L, 25L, 24L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 38L, 39L, 38L, 37L, 40L, 41L, 42L, 43L, 44L, 45L, 44L, 46L, 47L, 48L, 49L, 50L, 51L)), .Names = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", "V14", "V15"), class = "data.frame", row.names = c(NA, -50L))
And here is the output after ValidMerge is applied.
dput(RMDB.OUT) structure(list(V1 = c(3612L, NA, NA, 318L, 318L, 318L, 318L, 318L, 318L, NA, 741L, 444L, 741L, 407L, 2059L, 407L, 656L, 2058L, 257L, 4051L, 456L, 351L, 850L, 335L, 1000L, 1566L, 236L, 588L, 3877L, 750L, 2292L, 783L, 747L, 666L, 260L, 1118L, 341L, 7010L, 320L, 7010L, 249L, 458L, 24L, 631L, 631L, 875L), V2 = c(11.4, NA, NA, 23, 23, 23, 23, 23, 23, NA, 18.7, 11.6, 24.9, 28.8, 14.1, 28.8, 23.6, 18.3, 25, 7, 8.2, 23.6, 24.9, 29.5, 13.5, 19.4, 34.8, 17.4, 22.9, 27.6, 12, 26.6, 30.4, 12.9, 38.5, 35.4, 27.8, 19.2, 0, 17.3, 21.2, 19.3, 0, 3.9, 26.6, 22.6), V3 = c(27, NA, NA, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, NA, 4.5, 0, 0, 7.3, 0.3, 7.3, 12.2, 4.9, 0, 0.4, 0, 1.8, 15.1, 12.7, 0, 3.6, 3, 7.5, 14, 9.4, 0, 14.1, 4.1, 4, 1.4, 9.4, 5.9, 2.7, 0, 9.3, 1, 0, 0, 0, 1.8, 9.5 ), V4 = c(1.3, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, 3.1, 1.1, 3, 0.3, 3, 3, 0, 0, 0, 0, 3.6, 0.8, 2.7, 0, 2.8, 0.4, 0.8, 1.6, 6.4, 0, 3.8, 3.7, 0, 0, 2.6, 1.5, 2.5, 2.4, 1.4, 2, 5, 0, 0, 0.6, 0.5), V5 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"), V6 = c(10469L, 20001L, 21000L, 22000L, 22201L, 23000L, 23201L, 20000L, 20001L, 24001L, 0L, 30855L, 30953L, 31293L, 31436L, 31734L, 32841L, 33048L, 33466L, 33529L, 34048L, 34451L, 34565L, 35217L, 35367L, 37045L, 37733L, 38059L, 38256L, 39465L, 39624L, 39925L, 40333L, 40629L, 40736L, 41380L, 42370L, 43243L, 44836L, 44877L, 45887L, 46079L, 46217L, 46416L, 46553L, 46893L), V7 = c(11447L, 20400L, 21400L, 22200L, 22400L, 23200L, 23400L, 20001L, 20200L, 24600L, 0L, 30952L, 31131L, 31435L, 31733L, 31754L, 33037L, 33456L, 33509L, 34041L, 34108L, 34560L, 34921L, 35366L, 35499L, 37431L, 37861L, 38191L, 39464L, 39623L, 39924L, 40294L, 40626L, 40729L, 40878L, 42285L, 42504L, 44835L, 44876L, 45753L, 45987L, 46198L, 46240L, 46493L, 46722L, 47092L), V8 = structure(c(38L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 36L, 35L, 34L, 33L, 32L, 31L, 30L, 29L, 28L, 27L, 26L, 25L, 24L, 23L, 22L, 21L, 20L, 19L, 18L, 17L, 16L, 15L, 14L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("(249203529)", "(249203899)", "(249204128)", "(249204381)", "(249204423)", "(249204634)", "(249204868)", "(249205745)", "(249205786)", "(249208117)", "(249208336)", "(249209743)", "(249209892)", "(249209995)", "(249210327)", "(249210697)", "(249210998)", "(249211157)", "(249212430)", "(249212760)", "(249213190)", "(249215122)", "(249215255)", "(249215700)", "(249216061)", "(249216513)", "(249216580)", "(249217112)", "(249217165)", "(249217584)", "(249218867)", "(249218888)", "(249219186)", "(249219490)", "(249219669)", "(249219773)", "(249235266)", "(249239174)"), class = "factor"), V9 = structure(c(2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("+", "C"), class = "factor"), V10 = structure(c(32L, 23L, 23L, 23L, 24L, 23L, 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L, 22L, 12L, 5L, 14L, 13L, 16L, 30L, 25L, 2L, 7L, 16L, 15L, 29L, 28L, 3L, 28L, 15L, 4L, 18L, 8L, 19L, 10L, 31L, 10L, 9L, 11L, 6L, 17L, 20L, 21L), .Label = c("AluJo", "AluJr", "AluSx", "AluSz6", "AluYc", "AT_rich", "Charlie5", "ERVL-E-int", "L1M5", "L1MA8", "L1MA9", "L1MB5", "L1P1", "L1PA6", "L2a", "L2c", "LTR12F", "LTR16C", "MamRep1527", "MER45A", "MER58A", "MIR", "MIR3", "MIR3a", "MIRb", "MIRc", "MLT1A", "MLT1E1A", "MLT1E1A-int", "MLT1J2", "(TAAA)n", "TAR1", "(TC)n", "XXXXX"), class = "factor"), V11 = structure(c(10L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 11L, 9L, 13L, 12L, 13L, 13L, 3L, 12L, 3L, 3L, 4L, 9L, 13L, 12L, 1L, 4L, 4L, 9L, 9L, 12L, 9L, 4L, 12L, 8L, 8L, 6L, 3L, 11L, 3L, 3L, 3L, 5L, 7L, 2L, 1L), .Label = c("DNA/hAT-Charlie", "DNA/hAT-Tip100", "LINE/L1", "LINE/L2", "Low_complexity", "LTR", "LTR/ERV1", "LTR/ERVL", "LTR/ERVL-MaLR", "Satellite/telo", "Simple_repeat", "SINE/Alu", "SINE/MIR", "XXXXXXXX"), class = "factor"), V12 = structure(c(19L, NA, NA, 5L, 5L, 5L, 5L, 5L, 5L, NA, 2L, 9L, 8L, 25L, 2L, 10L, 9L, 23L, 2L, 1L, 15L, 12L, 1L, 6L, 2L, 11L, 16L, 13L, 7L, 2L, 2L, 8L, 14L, 1L, 3L, 26L, 17L, 18L, 9L, 21L, 22L, 24L, 2L, 4L, 27L, 20L), .Label = c("(0)", "1", "(113)", "(115)", "(119)", "(13)", "131", "173", "2", "218", "2234", "(231)", "2705", "2923", "2970", "3242", "359", "3715", "(399)", "(4)", "5306", "5334", "5746", "6167", "67", "(685)", "7"), class = "factor"), V13 = c(1712L, NA, NA, 143L, 143L, 143L, 143L, 143L, 143L, NA, 162L, 96L, 349L, 217L, 298L, 238L, 216L, 6174L, 44L, 6154L, 3030L, 3156L, 448L, 255L, 133L, 2623L, 3375L, 2846L, 1489L, 172L, 301L, 666L, 3217L, 312L, 376L, 4982L, 499L, 5305L, 41L, 6290L, 5433L, 6280L, 24L, 404L, 178L, 220L), V14 = structure(c(23L, NA, NA, 24L, 24L, 24L, 24L, 24L, 24L, NA, 8L, 1L, 10L, 25L, 4L, 13L, 21L, 1L, 11L, 26L, 15L, 14L, 20L, 30L, 5L, 2L, 1L, 27L, 1L, 18L, 3L, 4L, 7L, 6L, 9L, 19L, 22L, 29L, 1L, 2L, 28L, 16L, 1L, 17L, 1L, 12L), .Label = c("(0)", "(1)", "(11)", "(14)", "(179)", "208", "(209)", "(212)", "232", "(25)", "(255)", "3", "(30)", "3049", "(3116)", "(32)", "327", "(388)", "4016", "41", "(46)", "(470)", "483", "49", "(51)", "5640", "(573)", "(691)", "(838)", "91"), class = "factor"), V15 = c(2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 22L, 23L, 22L, 24L, 25L, 24L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 38L, 39L, 38L, 37L, 40L, 41L, 42L, 43L, 44L, 45L, 44L, 46L, 47L, 48L, 49L, 50L, 51L)), .Names = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", "V14", "V15"), class = "data.frame", row.names = c(NA, -46L))
Edit 4: Sorry about that, here is the simplified version: Initial Data.frame
dput(RMDB.cut)
structure(list(Chromosome = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"), Start = c(10469L, 20001L, 20210L, 21000L, 21201L, 22000L, 22201L, 23000L, 23201L, 20000L, 20001L, 24001L, 24205L, 24405L, 0L, 30855L, 30953L, 31293L, 31436L, 31734L), End = c(11447L, 20200L, 20400L, 21200L, 21400L, 22200L, 22400L, 23200L, 23400L, 20001L, 20200L, 24200L, 24400L, 24600L, 0L, 30952L, 31131L, 31435L, 31733L, 31754L),
(Left)
= structure(c(38L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 36L, 35L, 34L, 33L, 32L, 31L), .Label = c("(249203529)", "(249203899)", "(249204128)", "(249204381)", "(249204423)", "(249204634)", "(249204868)", "(249205745)", "(249205786)", "(249208117)", "(249208336)", "(249209743)", "(249209892)", "(249209995)", "(249210327)", "(249210697)", "(249210998)", "(249211157)", "(249212430)", "(249212760)", "(249213190)", "(249215122)", "(249215255)", "(249215700)", "(249216061)", "(249216513)", "(249216580)", "(249217112)", "(249217165)", "(249217584)", "(249218867)", "(249218888)", "(249219186)", "(249219490)", "(249219669)", "(249219773)", "(249235266)", "(249239174)"), class = "factor"), Strand = structure(c(2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("+", "C"), class = "factor"), repName = structure(c(32L, 23L, 23L, 23L, 23L, 23L, 24L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L), .Label = c("AluJo", "AluJr", "AluSx", "AluSz6", "AluYc", "AT_rich", "Charlie5", "ERVL-E-int", "L1M5", "L1MA8", "L1MA9", "L1MB5", "L1P1", "L1PA6", "L2a", "L2c", "LTR12F", "LTR16C", "MamRep1527", "MER45A", "MER58A", "MIR", "MIR3", "MIR3a", "MIRb", "MIRc", "MLT1A", "MLT1E1A", "MLT1E1A-int", "MLT1J2", "(TAAA)n", "TAR1", "(TC)n", "XXXXX"), class = "factor"), ValidMerge = c(FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE)), .Names = c("Chromosome", "Start", "End", "(Left)", "Strand", "repName", "ValidMerge"), row.names = c(NA, 20L), class = "data.frame")
And the output after merger
dput(RMDB.out.cut)
structure(list(Chromosome = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"), Start = c("10469", "20001", "21000", "22000", "22201", "23000", "23201", "20000", "20001", "24001", "0", "30855", "30953", "31293", "31436", "31734"), End = c("11447", "20400", "21400", "22200", "22400", "23200", "23400", "20001", "20200", "24600", "0", "30952", "31131", "31435", "31733", "31754"),
(Left)
= structure(c(38L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 36L, 35L, 34L, 33L, 32L, 31L), .Label = c("(249203529)", "(249203899)", "(249204128)", "(249204381)", "(249204423)", "(249204634)", "(249204868)", "(249205745)", "(249205786)", "(249208117)", "(249208336)", "(249209743)", "(249209892)", "(249209995)", "(249210327)", "(249210697)", "(249210998)", "(249211157)", "(249212430)", "(249212760)", "(249213190)", "(249215122)", "(249215255)", "(249215700)", "(249216061)", "(249216513)", "(249216580)", "(249217112)", "(249217165)", "(249217584)", "(249218867)", "(249218888)", "(249219186)", "(249219490)", "(249219669)", "(249219773)", "(249235266)", "(249239174)" ), class = "factor"), Strand = structure(c(2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("+", "C"), class = "factor"), repName = structure(c(32L, 23L, 23L, 23L, 24L, 23L, 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L), .Label = c("AluJo", "AluJr", "AluSx", "AluSz6", "AluYc", "AT_rich", "Charlie5", "ERVL-E-int", "L1M5", "L1MA8", "L1MA9", "L1MB5", "L1P1", "L1PA6", "L2a", "L2c", "LTR12F", "LTR16C", "MamRep1527", "MER45A", "MER58A", "MIR", "MIR3", "MIR3a", "MIRb", "MIRc", "MLT1A", "MLT1E1A", "MLT1E1A-int", "MLT1J2", "(TAAA)n", "TAR1", "(TC)n", "XXXXX"), class = "factor"), ValidMerge = c(FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE )), .Names = c("Chromosome", "Start", "End", "(Left)", "Strand", "repName", "ValidMerge"), row.names = c("2", "3", "4", "6", "7", "8", "9", "10", "11", "111", "15", "16", "17", "18", "19", "20" ), class = "data.frame")
?GRanges
and, e.g.,?reduce
, and the vignettes in this and the IRanges package (available from the web landing pages). – Martin Morgan